diff --git a/.circleci/config.yml b/.circleci/config.yml
index e7496d28d..14c1f1a9c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -34,6 +34,7 @@ jobs:
             . venv/bin/activate
             echo "tox" >> requirements.txt
             pip install -r requirements.txt
+            pip install -r requirements-dev.txt
 
       - save_cache:
           paths:
diff --git a/.gitignore b/.gitignore
index 47d48b829..448f3bc50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ dist/
 flake8/
 abydos.egg-info/
 .settings/
+*.bak
 *.pyc
 *.log
 .coverage
@@ -11,5 +12,3 @@ abydos.egg-info/
 .tox/
 binder/.ipynb_checkpoints
 binder/Untitled*
-docs/abydos.bib.bak
-
diff --git a/.travis.yml b/.travis.yml
index b56576c70..924a80090 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,6 @@ language: python
 
 python:
   - 2.7
-  - 3.3
   - 3.4
   - 3.5
   - 3.6
@@ -12,9 +11,6 @@ matrix:
     - python: 3.7
       dist: xenial
       sudo: true
-    - python: 3.8-dev
-      dist: xenial
-      sudo: true
 
 notifications:
   email: false
@@ -22,7 +18,11 @@ notifications:
 # Install packages
 install:
     - if [[ $TRAVIS_PYTHON_VERSION == 2* ]]; then travis_retry pip install pyliblzma; fi
+    - case "$TRAVIS_PYTHON_VERSION" in 3.5|3.6|3.7) travis_retry pip install cython;; esac
+    - case "$TRAVIS_PYTHON_VERSION" in 3.5|3.6|3.7) travis_retry pip install paq lzss;; esac
    - travis_retry pip install coveralls
+    - travis_retry pip install -r requirements.txt
+    - travis_retry pip install -r requirements-dev.txt
    - travis_retry python setup.py install
 
 # Run test
diff --git a/CODING_STANDARDS.rst b/CODING_STANDARDS.rst
index 908bfde89..90e65d70d 100644
--- a/CODING_STANDARDS.rst
+++ b/CODING_STANDARDS.rst
@@ -1,11 +1,13 @@
 CODING STANDARDS
 ----------------
 
-- nosetest will be used for testing
-- flake8 will be used for best practice conformance
-- pydocstyle will be used to ensure documentation style conformance to PEP257
-  (for the most part) and NumPy documentation style
-- black will be used to keep code style consistent
+- Nosetest will be used for testing.
+- Flake8 will be used for best practice conformance.
+- Pydocstyle will be used to ensure documentation style conformance to PEP257
+  (for the most part) and NumPy documentation style.
+- Black will be used to keep code style consistent.
+- 3rd party packages may be used, but must be present in both PyPI and conda
+  or conda-forge. They must also support all supported Python versions.
 
 ----
 
@@ -28,3 +30,24 @@ A git push should be performed only under the following conditions:
 - test coverage is 100% according to nosetests
 - flake8 and pydocstyle should report 0 issues
 - black code styling has been applied
+
+
+Notes on architecture
+~~~~~~~~~~~~~~~~~~~~~
+
+As of the 0.3.6 release, each major algorithm of the compression, distance,
+fingerprint, phonetic, & stemmer subpackages has been moved into a class of
+its own. The distance, fingerprint, phonetic, & stemmer classes each inherit
+from their respective common base classes that define basic methods for these
+four major types of classes.
+
+The old functional API for these subpackages has been retained for backwards
+compatibility until the release of version 0.6, but its use is deprecated as
+of version 0.4. New classes (those not present at the release of version
+0.3.6) will not be given functional API wrappers.
+
+Although, as of the 0.3.6 release, many of the classes that encapsulate
+pre-0.3.6 functions consist simply of a single method that could be a static
+method, making these methods static is generally avoided.
+As development continues, these classes will take more advantage of object +architecture to store parameters between calls and inherit from base classes. diff --git a/HISTORY.rst b/HISTORY.rst index 669649e93..9e85f5c96 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,268 @@ Release History --------------- +0.4.0 (2018-04-30) *dietrich* ++++++++++++++++++++++++++++++ + +doi: + +Version 0.4.0 focuses on distance measures, adding 211 new measures. Attempts +were made to provide normalized version for measure that did not inherently +range from 0 to 1. The other major focus was the addition of 12 tokenizers, in +service of expanding distance measure options. + +Changes: + +- Deprecated functions that merely wrap class methods to maintain API + compatibility, for removal in 0.6.0 +- Added methods to ConfusionTable to return: + - its internal representation + - false negative rate + - false omission rate + - positive & negative likelihood ratios + - diagnostic odds ratio + - error rate + - prevalence + - Jaccard index + - D-measure + - Phi coefficient + - joint, actual, & predicted entropies + - mutual information + - proficiency (uncertainty coefficient) + - information gain ratio + - dependency + - lift +- Deprecated f-measure & g-measure from ConfusionTable for removal in + 0.6.0 +- Added notes to indicate when functions, classes, & methods were added +- Added the following 12 tokenizers: + - QSkipgrams + - CharacterTokenizer + - RegexpTokenizer, WhitespaceTokenizer, & WordpunctTokenizer + - COrVClusterTokenizer, CVClusterTokenizer, & VCClusterTokenizer + - SonoriPyTokenizer & LegaliPyTokenizer + - NLTKTokenizer + - SAPSTokenizer +- Added the UnigramCorpus class & a facility for downloading data, such as + pre-processed/trained data, from storage on GitHub +- Added the Wåhlin phonetic encoding +- Added the following 211 similarity/distance/correlation measures: + - ALINE + - AMPLE + - Anderberg + - Andres & Marzo's Delta + - Average Linkage + - AZZOO + - Baroni-Urbani & Buser I + - Baroni-Urbani & Buser II + - Batagelj & Bren + - Baulieu I + - Baulieu II + - Baulieu III + - Baulieu IV + - Baulieu V + - Baulieu VI + - Baulieu VII + - Baulieu VIII + - Baulieu IX + - Baulieu X + - Baulieu XI + - Baulieu XII + - Baulieu XIII + - Baulieu XIV + - Baulieu XV + - Benini I + - Benini II + - Bennet + - Bhattacharyya + - BI-SIM + - BLEU + - Bloc kLevenshtein + - Brainerd-Robinson + - Braun-Blanquet + - Canberra + - Chord + - Clement + - Cohen's Kappa + - Cole + - Complete Linkage + - Consonni & Todeschini I + - Consonni & Todeschini II + - Consonni & Todeschini III + - Consonni & Todeschini IV + - Consonni & Todeschini V + - Cormode's LZ + - Covington + - Dennis + - Dice Asymmetric I + - Dice Asymmetric II + - Digby + - Dispersion + - Doolittle + - Dunning + - Eyraud + - Fager & McGowan + - Faith + - Fellegi-Sunter + - Fidelity + - Fleiss + - Fleiss-Levin-Paik + - FlexMetric + - Forbes I + - Forbes II + - Fossum + - FuzzyWuzzy Partial String + - FuzzyWuzzy Token Set + - FuzzyWuzzy Token Sort + - Generalized Fleiss + - Gilbert + - Gilbert & Wells + - Gini I + - Gini II + - Goodall + - Goodman & Kruskal's Lambda + - Goodman & Kruskal's Lambda-r + - Goodman & Kruskal's Tau A + - Goodman & Kruskal's Tau B + - Gower & Legendre + - Guttman's Lambda A + - Guttman's Lambda B + - Gwet's AC + - Hamann + - Harris & Lahey + - Hassanat + - Hawkins & Dotson + - Hellinger + - Higuera & Mico + - Hurlbert + - Iterative SubString + - Jaccard-NM + - Jensen-Shannon + - Johnson + - Kendall's Tau + - Kent & Foster I + 
- Kent & Foster II + - Koppen I + - Koppen II + - Kuder & Richardson + - Kuhns I + - Kuhns II + - Kuhns III + - Kuhns IV + - Kuhns V + - Kuhns VI + - Kuhns VII + - Kuhns VIII + - Kuhns IX + - Kuhns X + - Kuhns XI + - Kuhns XII + - Kulczynski I + - Kulczynski II + - Longest Common Prefix + - Longest Common Suffix + - Lorentzian + - Maarel + - Marking + - Marking Metric + - MASI + - Matusita + - Maxwell & Pilliner + - McConnaughey + - McEwen & Michael + - MetaLevenshtein + - Michelet + - MinHash + - Mountford + - Mean Squared Contingency + - Mutual Information + - NCD with LZSS + - NCD with PAQ9a + - Ozbay + - Pattern + - Pearson's Chi-Squared + - Pearson & Heron II + - Pearson II + - Pearson III + - Pearson's Phi + - Peirce + - Positional Q-Gram Dice + - Positional Q-Gram Jaccard + - Positional Q-Gram Overlap + - Q-Gram + - Quantitative Cosine + - Quantitative Dice + - Quantitative Jaccard + - Rees-Levenshtein + - Roberts + - Rogers & Tanimoto + - Rogot & Goldberg + - Rouge-L + - Rouge-S + - Rouge-SU + - Rouge-W + - Russell & Rao + - SAPS + - Scott's Pi + - Shape + - Shapira & Storer I + - Sift4 Extended + - Single Linkage + - Size + - Soft Cosine + - SoftTF-IDF + - Sokal & Michener + - Sokal & Sneath I + - Sokal & Sneath II + - Sokal & Sneath III + - Sokal & Sneath IV + - Sokal & Sneath V + - Sorgenfrei + - Steffensen + - Stiles + - Stuart's Tau + - Tarantula + - Tarwid + - Tetrachoric + - TF-IDF + - Tichy + - Tulloss's R + - Tulloss's S + - Tulloss's T + - Tulloss's U + - Unigram Subtuple + - Unknown A + - Unknown B + - Unknown C + - Unknown D + - Unknown E + - Unknown F + - Unknown G + - Unknown H + - Unknown I + - Unknown J + - Unknown K + - Unknown L + - Unknown M + - Upholt + - Warrens I + - Warrens II + - Warrens III + - Warrens IV + - Warrens V + - Weighted Jaccard + - Whittaker + - Yates' Chi-Squared + - YJHHR + - Yujian & Bo + - Yule's Q + - Yule's Q II + - Yule's Y +- Four intersection types are now supported for all distance measure that are + based on _TokenDistance. In addition to basic crisp intersections, soft, + fuzzy, and group linkage intersections have been provided. + + 0.3.6 (2018-11-17) *classy carl* ++++++++++++++++++++++++++++++++ @@ -97,6 +359,7 @@ Changes: - UEA-Lite Stemmer - Paice-Husk Stemmer - Schinke Latin stemmer + - S stemmer - Eliminated ._compat submodule in favor of six - Transitioned from PEP8 to flake8, etc. 
- Phonetic algorithms now consistently use max_length=-1 to indicate that diff --git a/Pipfile b/Pipfile new file mode 100644 index 000000000..796796e42 --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +tox = "*" +nose = "*" +coverage = "*" +scipy = "*" +nltk = "*" +syllabipy = "*" + +[packages] +numpy = "*" +six = "*" +deprecation = "*" diff --git a/README.rst b/README.rst index 9355ee021..198045e89 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,7 @@ Abydos +------------------+------------------------------------------------------+ | Usage | |docs| |mybinder| |license| |sourcerank| |zenodo| | +------------------+------------------------------------------------------+ -| Contribution | |cii| |waffle| |openhub| | +| Contribution | |cii| |openhub| |gh-commits| |gh-issues| |gh-stars| | +------------------+------------------------------------------------------+ | PyPI | |pypi| |pypi-dl| |pypi-ver| | +------------------+------------------------------------------------------+ @@ -71,7 +71,7 @@ Abydos :target: https://app.fossa.io/projects/git%2Bgithub.com%2Fchrislit%2Fabydos?ref=badge_shield :alt: FOSSA Status -.. |pylint| image:: https://img.shields.io/badge/Pylint-9.16/10-yellowgreen.svg +.. |pylint| image:: https://img.shields.io/badge/Pylint-8.86/10-yellowgreen.svg :target: # :alt: Pylint Score @@ -111,14 +111,19 @@ Abydos :target: https://bestpractices.coreinfrastructure.org/projects/1598 :alt: CII Best Practices -.. |waffle| image:: https://badge.waffle.io/chrislit/abydos.svg?columns=To%20Do,In%20Progress - :target: https://waffle.io/chrislit/abydos - :alt: 'Waffle.io - Columns and their card count' - .. |openhub| image:: https://www.openhub.net/p/abydosnlp/widgets/project_thin_badge.gif :target: https://www.openhub.net/p/abydosnlp :alt: OpenHUB +.. |gh-commits| image:: https://img.shields.io/github/commit-activity/y/chrislit/abydos.svg?logo=git&logoColor=white + :target: https://github.com/chrislit/abydos/graphs/commit-activity + +.. |gh-issues| image:: https://img.shields.io/github/issues-closed/chrislit/abydos.svg?logo=github&logoColor=white + :target: https://github.com/chrislit/abydos/issues?q= + +.. |gh-stars| image:: https://img.shields.io/github/stars/chrislit/abydos.svg?logo=github&logoColor=white + :target: https://github.com/chrislit/abydos/stargazers + .. |pypi| image:: https://img.shields.io/pypi/v/abydos.svg :target: https://pypi.python.org/pypi/abydos :alt: PyPI @@ -152,7 +157,7 @@ Abydos | | `Abydos NLP/IR library `_ -| Copyright 2014-2018 by Christopher C. Little +| Copyright 2014-2019 by Christopher C. Little Abydos is a library of phonetic algorithms, string distance measures & metrics, stemmers, and string fingerprinters including: @@ -268,12 +273,19 @@ Installation Required libraries: -- Numpy -- Six +- NumPy +- six +- deprecation -Recommended libraries: +Optional libraries (all available on PyPI, some available on conda or +conda-forge): +- SciPy - PylibLZMA (Python 2 only--for LZMA compression string distance metric) +- `SyllabiPy `_ +- `NLTK `_ +- `PyLZSS `_ +- `paq `_ To install Abydos (master) from Github source:: @@ -296,7 +308,7 @@ To install from `conda-forge `_:: conda install abydos -It should run on Python 2.7 and Python 3.3-3.7. +It should run on Python 2.7 and Python 3.4-3.7. 
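As a quick sanity check after installation, the class-based interface that this
release standardizes on (see the architecture notes in CODING_STANDARDS.rst
above) can be exercised directly. This is a minimal sketch, assuming only that
distance classes such as ``Levenshtein`` expose the ``dist_abs``, ``dist``, and
``sim`` methods described under ``abydos.distance``, and that the retained
functional wrappers (e.g. ``levenshtein``) now merely delegate to those methods
while emitting a deprecation warning::

    >>> from abydos.distance import Levenshtein, levenshtein
    >>> lev = Levenshtein()
    >>> lev.dist_abs('cat', 'hat')        # absolute edit distance
    1
    >>> round(lev.dist('cat', 'hat'), 3)  # normalized to [0, 1]
    0.333
    >>> round(lev.sim('cat', 'hat'), 3)   # 1 - dist
    0.667
    >>> levenshtein('cat', 'hat')         # deprecated wrapper, slated for removal in 0.6.0
    1

The same class-first pattern applies to the compression, fingerprint, phonetic,
and stemmer subpackages, whose pre-0.3.6 functional APIs are likewise
deprecated as of 0.4.0.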
Testing & Contributing ====================== diff --git a/VERSION.rst b/VERSION.rst new file mode 100644 index 000000000..1d0ba9ea1 --- /dev/null +++ b/VERSION.rst @@ -0,0 +1 @@ +0.4.0 diff --git a/abydos/__init__.py b/abydos/__init__.py index 772cb6704..d765d92bc 100644 --- a/abydos/__init__.py +++ b/abydos/__init__.py @@ -48,7 +48,7 @@ unicode_literals, ) -__version__ = '0.3.6' +__version__ = '0.4.0' __all__ = [ 'compression', diff --git a/abydos/compression/_arithmetic.py b/abydos/compression/_arithmetic.py index 503d59066..2caf0da9e 100644 --- a/abydos/compression/_arithmetic.py +++ b/abydos/compression/_arithmetic.py @@ -31,8 +31,11 @@ from collections import Counter from fractions import Fraction +from deprecation import deprecated + from six import PY3, text_type +from .. import __version__ if PY3: long = int @@ -45,6 +48,9 @@ class Arithmetic(object): This is based on Andrew Dalke's public domain implementation :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class. + + + .. versionadded:: 0.3.6 """ _probs = {} @@ -57,6 +63,9 @@ def __init__(self, text=None): text : str The training text + + .. versionadded:: 0.3.6 + """ if text is not None: self.train(text) @@ -69,6 +78,9 @@ def get_probs(self): dict The dictionary of probabilities + + .. versionadded:: 0.3.6 + """ return self._probs @@ -80,6 +92,9 @@ def set_probs(self, probs): probs : dict The dictionary of probabilities + + .. versionadded:: 0.3.6 + """ self._probs = probs @@ -128,6 +143,11 @@ def train(self, text): 'a': (Fraction(43, 45), Fraction(44, 45)), '\x00': (Fraction(44, 45), Fraction(1, 1))} + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ text = text_type(text) if '\x00' in text: @@ -170,6 +190,11 @@ def encode(self, text): >>> ac.encode('align') (16720586181, 34) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ text = text_type(text) if '\x00' in text: @@ -222,6 +247,11 @@ def decode(self, longval, nbits): >>> ac.decode(16720586181, 34) 'align' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ val = Fraction(longval, long(1) << nbits) letters = [] @@ -245,6 +275,12 @@ def decode(self, longval, nbits): return ''.join(letters) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Arithmetic.train method instead.', +) def ac_train(text): r"""Generate a probability dict from the provided text. @@ -293,10 +329,19 @@ def ac_train(text): 'a': (Fraction(43, 45), Fraction(44, 45)), '\x00': (Fraction(44, 45), Fraction(1, 1))} + + .. versionadded:: 0.1.0 + """ return Arithmetic(text).get_probs() +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Arithmetic.encode method instead.', +) def ac_encode(text, probs): """Encode a text using arithmetic coding with the provided probabilities. @@ -321,12 +366,21 @@ def ac_encode(text, probs): >>> ac_encode('align', pr) (16720586181, 34) + + .. versionadded:: 0.1.0 + """ coder = Arithmetic() coder.set_probs(probs) return coder.encode(text) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Arithmetic.decode method instead.', +) def ac_decode(longval, nbits, probs): """Decode the number to a string using the given statistics. @@ -353,6 +407,9 @@ def ac_decode(longval, nbits, probs): >>> ac_decode(16720586181, 34, pr) 'align' + + .. 
versionadded:: 0.1.0 + """ coder = Arithmetic() coder.set_probs(probs) diff --git a/abydos/compression/_bwt.py b/abydos/compression/_bwt.py index 38d3a43d3..ef24cf8e4 100644 --- a/abydos/compression/_bwt.py +++ b/abydos/compression/_bwt.py @@ -28,8 +28,11 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range +from .. import __version__ __all__ = ['BWT', 'bwt_decode', 'bwt_encode'] @@ -40,17 +43,31 @@ class BWT(object): The Burrows-Wheeler transform is an attempt at placing similar characters together to improve compression. Cf. :cite:`Burrows:1994`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, terminator='\0'): + def __init__(self, terminator='\0'): + """Initialize BWT instance. + + Parameters + ---------- + terminator : str + A character added to signal the end of the string + + + .. versionadded:: 0.4.0 + + """ + self._terminator = terminator + + def encode(self, word): r"""Return the Burrows-Wheeler transformed form of a word. Parameters ---------- word : str The word to transform using BWT - terminator : str - A character added to signal the end of the string Returns ------- @@ -69,27 +86,34 @@ def encode(self, word, terminator='\0'): 'n\x00ilag' >>> bwt.encode('banana') 'annb\x00aa' - >>> bwt.encode('banana', '@') + + >>> bwt = BWT('@') + >>> bwt.encode('banana') 'annb@aa' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if word: - if terminator in word: + if self._terminator in word: raise ValueError( 'Specified terminator, {}, already in word.'.format( - terminator if terminator != '\0' else '\\0' + self._terminator if self._terminator != '\0' else '\\0' ) ) else: - word += terminator + word += self._terminator wordlist = sorted( word[i:] + word[:i] for i in range(len(word)) ) return ''.join([w[-1] for w in wordlist]) else: - return terminator + return self._terminator - def decode(self, code, terminator='\0'): + def decode(self, code): r"""Return a word decoded from BWT form. Parameters @@ -116,15 +140,22 @@ def decode(self, code, terminator='\0'): 'align' >>> bwt.decode('annb\x00aa') 'banana' - >>> bwt.decode('annb@aa', '@') + + >>> bwt = BWT('@') + >>> bwt.decode('annb@aa') 'banana' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if code: - if terminator not in code: + if self._terminator not in code: raise ValueError( 'Specified terminator, {}, absent from code.'.format( - terminator if terminator != '\0' else '\\0' + self._terminator if self._terminator != '\0' else '\\0' ) ) else: @@ -133,12 +164,18 @@ def decode(self, code, terminator='\0'): wordlist = sorted( code[i] + wordlist[i] for i in range(len(code)) ) - rows = [w for w in wordlist if w[-1] == terminator][0] - return rows.rstrip(terminator) + rows = [w for w in wordlist if w[-1] == self._terminator][0] + return rows.rstrip(self._terminator) else: return '' +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the BWT.encode method instead.', +) def bwt_encode(word, terminator='\0'): r"""Return the Burrows-Wheeler transformed form of a word. @@ -165,10 +202,18 @@ def bwt_encode(word, terminator='\0'): >>> bwt_encode('banana', '@') 'annb@aa' + .. 
versionadded:: 0.1.0 + """ - return BWT().encode(word, terminator) + return BWT(terminator).encode(word) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the BWT.decode method instead.', +) def bwt_decode(code, terminator='\0'): r"""Return a word decoded from BWT form. @@ -195,8 +240,10 @@ def bwt_decode(code, terminator='\0'): >>> bwt_decode('annb@aa', '@') 'banana' + .. versionadded:: 0.1.0 + """ - return BWT().decode(code, terminator) + return BWT(terminator).decode(code) if __name__ == '__main__': diff --git a/abydos/compression/_rle.py b/abydos/compression/_rle.py index 70038f762..00524f30c 100644 --- a/abydos/compression/_rle.py +++ b/abydos/compression/_rle.py @@ -30,8 +30,10 @@ from itertools import groupby -from ._bwt import BWT +from deprecation import deprecated +from ._bwt import BWT +from .. import __version__ __all__ = ['RLE', 'rle_decode', 'rle_encode'] @@ -45,6 +47,8 @@ class RLE(object): :cite:`rosettacode:2018`. This is licensed GFDL 1.2. Digits 0-9 cannot be in text. + + .. versionadded:: 0.3.6 """ def encode(self, text): @@ -79,6 +83,10 @@ def encode(self, text): >>> rle.encode('aaabaabababa') '3abaabababa' + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if text: text = ((len(list(g)), k) for k, g in groupby(text)) @@ -120,6 +128,10 @@ def decode(self, text): >>> rle.decode('3abaabababa') 'aaabaabababa' + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ mult = '' decoded = [] @@ -137,6 +149,12 @@ def decode(self, text): return text +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RLE.encode method instead.', +) def rle_encode(text, use_bwt=True): r"""Perform encoding of run-length-encoding (RLE). @@ -171,12 +189,20 @@ def rle_encode(text, use_bwt=True): >>> rle_encode('aaabaabababa', False) '3abaabababa' + .. versionadded:: 0.1.0 + """ if use_bwt: text = BWT().encode(text) return RLE().encode(text) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RLE.decode method instead.', +) def rle_decode(text, use_bwt=True): r"""Perform decoding of run-length-encoding (RLE). @@ -211,6 +237,8 @@ def rle_decode(text, use_bwt=True): >>> rle_decode('3abaabababa', False) 'aaabaabababa' + .. versionadded:: 0.1.0 + """ text = RLE().decode(text) if use_bwt: diff --git a/abydos/corpus/__init__.py b/abydos/corpus/__init__.py index 5123b50cb..a022e8ffb 100644 --- a/abydos/corpus/__init__.py +++ b/abydos/corpus/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -22,6 +22,7 @@ - :py:class:`Corpus` - :py:class:`NGramCorpus` +- :py:class:`UnigramCorpus` As a quick example of :py:class:`.Corpus`: @@ -33,9 +34,9 @@ [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']], [['And', 'then', 'it', 'slept.']], [['And', 'the', 'dog', 'ran', 'off.']]] >>> round(corp.idf('dog'), 10) -0.4771212547 +1.0986122887 >>> round(corp.idf('the'), 10) -0.1760912591 +0.4054651081 Here, each sentence is a separate "document". We can retrieve IDF values from the :py:class:`.Corpus`. 
The same :py:class:`.Corpus` can be used to initialize @@ -46,10 +47,7 @@ 2 >>> ngcorp.get_count('fox') 1 ->>> ngcorp.tf('the') -1.3010299956639813 ->>> ngcorp.tf('fox') -1.0 + ---- @@ -64,8 +62,9 @@ from ._corpus import Corpus from ._ngram_corpus import NGramCorpus +from ._unigram_corpus import UnigramCorpus -__all__ = ['Corpus', 'NGramCorpus'] +__all__ = ['Corpus', 'NGramCorpus', 'UnigramCorpus'] if __name__ == '__main__': diff --git a/abydos/corpus/_corpus.py b/abydos/corpus/_corpus.py index a3d43e8b7..ac3f40922 100644 --- a/abydos/corpus/_corpus.py +++ b/abydos/corpus/_corpus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -29,7 +29,7 @@ unicode_literals, ) -from math import log10 +from math import log __all__ = ['Corpus'] @@ -41,6 +41,8 @@ class Corpus(object): of documents. Each document is an ordered list of sentences in those documents. And each sentence is an ordered list of words that make up that sentence. + + .. versionadded:: 0.1.0 """ def __init__( @@ -50,6 +52,7 @@ def __init__( sent_split='\n', filter_chars='', stop_words=None, + word_tokenizer=None, ): r"""Initialize Corpus. @@ -72,6 +75,9 @@ def __init__( stop_words : list A list of words (as a tuple, set, or list) to filter out of the corpus text + word_tokenizer : _Tokenizer + A tokenizer to apply to each sentence in order to retrieve the + individual "word" tokens. If set to none, str.split() will be used. Example ------- @@ -79,6 +85,9 @@ def __init__( >>> tqbf += 'And then it slept.\n And the dog ran off.' >>> corp = Corpus(tqbf) + + .. versionadded:: 0.1.0 + """ self.corpus = [] self.doc_split = doc_split @@ -86,7 +95,12 @@ def __init__( for document in corpus_text.split(doc_split): doc = [] - for sentence in (s.split() for s in document.split(sent_split)): + for sentence in document.split(sent_split): + if word_tokenizer: + sentence = word_tokenizer.tokenize(sentence).get_list() + else: + sentence = sentence.split() + if stop_words: for word in set(stop_words): while word in sentence: @@ -121,6 +135,9 @@ def docs(self): >>> len(corp.docs()) 1 + + .. versionadded:: 0.1.0 + """ return self.corpus @@ -149,6 +166,9 @@ def paras(self): >>> len(corp.paras()) 1 + + .. versionadded:: 0.1.0 + """ return self.docs() @@ -197,6 +217,9 @@ def words(self): >>> len(corp.words()) 18 + + .. versionadded:: 0.1.0 + """ return [words for sents in self.sents() for words in sents] @@ -223,6 +246,9 @@ def docs_of_words(self): >>> len(corp.docs_of_words()) 1 + + .. versionadded:: 0.1.0 + """ return [ [words for sents in doc for words in sents] for doc in self.corpus @@ -251,6 +277,9 @@ def raw(self): >>> len(corp.raw()) 85 + + .. versionadded:: 0.1.0 + """ doc_list = [] for doc in self.corpus: @@ -288,9 +317,12 @@ def idf(self, term, transform=None): [['And', 'then', 'it', 'slept.']], [['And', 'the', 'dog', 'ran', 'off.']]] >>> round(corp.idf('dog'), 10) - 0.4771212547 + 1.0986122887 >>> round(corp.idf('the'), 10) - 0.1760912591 + 0.4054651081 + + + .. 
versionadded:: 0.1.0 """ docs_with_term = 0 @@ -309,7 +341,7 @@ def idf(self, term, transform=None): if docs_with_term == 0: return float('inf') - return log10(len(docs) / docs_with_term) + return log(len(docs) / docs_with_term) if __name__ == '__main__': diff --git a/abydos/corpus/_ngram_corpus.py b/abydos/corpus/_ngram_corpus.py index 216c3045d..ad39affed 100644 --- a/abydos/corpus/_ngram_corpus.py +++ b/abydos/corpus/_ngram_corpus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -30,7 +30,6 @@ from codecs import open as c_open from collections import Counter -from math import log10 from six import text_type from six.moves import range @@ -53,6 +52,8 @@ class NGramCorpus(object): level is a numeric value representing the frequency of the trigram. E.g. the trigram frequency of 'colorless green ideas' would be the value stored in ``self.ngcorpus['colorless']['green']['ideas'][None]``. + + .. versionadded:: 0.3.0 """ def __init__(self, corpus=None): @@ -76,6 +77,9 @@ def __init__(self, corpus=None): >>> tqbf += 'And then it slept.\n And the dog ran off.' >>> ngcorp = NGramCorpus(Corpus(tqbf)) + + .. versionadded:: 0.3.0 + """ self.ngcorpus = Counter() @@ -116,6 +120,9 @@ def corpus_importer(self, corpus, n_val=1, bos='_START_', eos='_END_'): >>> ngcorp = NGramCorpus() >>> ngcorp.corpus_importer(Corpus(tqbf)) + + .. versionadded:: 0.3.0 + """ if not corpus or not isinstance(corpus, Corpus): raise TypeError('Corpus argument of the Corpus class required.') @@ -158,11 +165,14 @@ def get_count(self, ngram, corpus=None): >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n' >>> tqbf += 'And then it slept.\n And the dog ran off.' >>> ngcorp = NGramCorpus(Corpus(tqbf)) - >>> NGramCorpus(Corpus(tqbf)).get_count('the') + >>> ngcorp.get_count('the') 2 - >>> NGramCorpus(Corpus(tqbf)).get_count('fox') + >>> ngcorp.get_count('fox') 1 + + .. versionadded:: 0.3.0 + """ if not corpus: corpus = self.ngcorpus @@ -194,6 +204,9 @@ def _add_to_ngcorpus(self, corpus, words, count): count : int Count of words + + .. versionadded:: 0.3.0 + """ if words[0] not in corpus: corpus[words[0]] = Counter() @@ -211,6 +224,9 @@ def gng_importer(self, corpus_file): corpus_file : file The Google NGram file from which to initialize the n-gram corpus + + .. versionadded:: 0.3.0 + """ with c_open(corpus_file, 'r', encoding='utf-8') as gng: for line in gng: @@ -219,44 +235,6 @@ def gng_importer(self, corpus_file): self._add_to_ngcorpus(self.ngcorpus, words, int(line[2])) - def tf(self, term): - r"""Return term frequency. - - Parameters - ---------- - term : str - The term for which to calculate tf - - Returns - ------- - float - The term frequency (tf) - - Raises - ------ - ValueError - tf can only calculate the frequency of individual words - - Examples - -------- - >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n' - >>> tqbf += 'And then it slept.\n And the dog ran off.' 
- >>> ngcorp = NGramCorpus(Corpus(tqbf)) - >>> NGramCorpus(Corpus(tqbf)).tf('the') - 1.3010299956639813 - >>> NGramCorpus(Corpus(tqbf)).tf('fox') - 1.0 - - """ - if ' ' in term: - raise ValueError( - 'tf can only calculate the term frequency of individual words' - ) - tcount = self.get_count(term) - if tcount == 0: - return 0.0 - return 1 + log10(tcount) - if __name__ == '__main__': import doctest diff --git a/abydos/corpus/_unigram_corpus.py b/abydos/corpus/_unigram_corpus.py new file mode 100644 index 000000000..04fe95b1b --- /dev/null +++ b/abydos/corpus/_unigram_corpus.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.corpus._unigram_corpus. + +Unigram Corpus +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import pickle # noqa: S403 +from codecs import open as c_open +from collections import Counter, defaultdict +from math import log1p + +__all__ = ['UnigramCorpus'] + + +def _dd_default(*args): + return 0, 0 + + +class UnigramCorpus(object): + """Unigram corpus class. + + Largely intended for calculating inverse document frequence (IDF) from a + large corpus of unigram (or smaller) tokens, this class encapsulates a + dict object. Each key is a unigram token whose value is a tuple consisting + of the number of times a term appeared and the number of distinct documents + in which it appeared. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + corpus_text='', + documents=0, + word_transform=None, + word_tokenizer=None, + ): + r"""Initialize UnigramCorpus. + + Parameters + ---------- + corpus_text : str + The corpus text as a single string + documents : int + The number of documents in the corpus. If equal to 0 (the default) + then the maximum from the internal dictionary's distinct + documents count. + word_transform : function + A function to apply to each term before term tokenization and + addition to the corpus. One might use this, for example, to apply + Soundex encoding to each term. + word_tokenizer : _Tokenizer + A tokenizer to apply to each sentence in order to retrieve the + individual "word" tokens. If set to none, str.split() will be used. + + Example + ------- + >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n' + >>> tqbf += 'And then it slept.\n And the dog ran off.' + >>> corp = UnigramCorpus(tqbf) + + + .. versionadded:: 0.4.0 + + """ + self.corpus = defaultdict(_dd_default) + self.transform = word_transform + self.tokenizer = word_tokenizer + self.doc_count = documents + + self.add_document(corpus_text) + + def add_document(self, doc): + """Add a new document to the corpus. + + Parameters + ---------- + doc : str + A string, representing the document to be added. + + + .. 
versionadded:: 0.4.0 + + """ + for word, count in Counter(doc.split()).items(): + self._add_word(word, count, 1) + self.doc_count += 1 + + def save_corpus(self, filename): + """Save the corpus to a file. + + This employs pickle to save the corpus (a defaultdict). Other + parameters of the corpus, such as its word_tokenizer, will not be + affected and should be set during initialization. + + Parameters + ---------- + filename : str + The filename to save the corpus to. + + + .. versionadded:: 0.4.0 + + """ + with open(filename, mode='wb') as pkl: + pickle.dump(self.corpus, pkl) + + def load_corpus(self, filename): + """Load the corpus from a file. + + This employs pickle to load the corpus (a defaultdict). Other + parameters of the corpus, such as its word_tokenizer, will not be + affected and should be set during initialization. + + Parameters + ---------- + filename : str + The filename to load the corpus from. + + + .. versionadded:: 0.4.0 + + """ + with open(filename, mode='rb') as pkl: + self.corpus = pickle.load(pkl) # noqa: S301 + self._update_doc_count() + + def _update_doc_count(self): + """Update document count, if necessary. + + .. versionadded:: 0.4.0 + """ + max_docs = max(self.corpus.values(), key=lambda _: _[1])[1] + self.doc_count = max(max_docs, self.doc_count) + + def _add_word(self, word, count, doc_count): + """Add a term to the corpus, possibly after tokenization. + + Parameters + ---------- + word : str + Word to add to the corpus + count : int + Count of word appearances + doc_count : int + Count of distinct documents in which word appears + + + .. versionadded:: 0.4.0 + + """ + if self.transform is not None: + word = self.transform(word) + + if self.tokenizer is not None: + tokens = self.tokenizer.tokenize(word).get_counter() + for tok in tokens: + n = tokens[tok] * count + prior_count, prior_doc_count = self.corpus[tok] + self.corpus[tok] = ( + prior_count + n, + prior_doc_count + doc_count, + ) + else: + prior_count, prior_doc_count = self.corpus[word] + self.corpus[word] = ( + prior_count + count, + prior_doc_count + doc_count, + ) + + def gng_importer(self, corpus_file): + """Fill in self.corpus from a Google NGram corpus file. + + Parameters + ---------- + corpus_file : file + The Google NGram file from which to initialize the n-gram corpus + + + .. versionadded:: 0.4.0 + + """ + with c_open(corpus_file, 'r', encoding='utf-8') as gng: + for line in gng: + line = line.rstrip().split('\t') + word = line[0] + if '_' in word: + word = word[: word.find('_')] + + self._add_word(word, int(line[2]), int(line[3])) + self._update_doc_count() + + def idf(self, term): + r"""Calculate the Inverse Document Frequency of a term in the corpus. + + Parameters + ---------- + term : str + The term to calculate the IDF of + + Returns + ------- + float + The IDF + + Examples + -------- + >>> tqbf = 'the quick brown fox jumped over the lazy dog\n\n' + >>> tqbf += 'and then it slept\n\n and the dog ran off' + >>> corp = UnigramCorpus(tqbf) + >>> round(corp.idf('dog'), 10) + 0.6931471806 + >>> round(corp.idf('the'), 10) + 0.6931471806 + + + .. 
versionadded:: 0.4.0 + + """ + if term in self.corpus: + count, term_doc_count = self.corpus[term] + return log1p(self.doc_count / term_doc_count) + else: + return float('inf') + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/distance/__init__.py b/abydos/distance/__init__.py index ce5116701..b6ac20697 100644 --- a/abydos/distance/__init__.py +++ b/abydos/distance/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -16,7 +16,7 @@ # You should have received a copy of the GNU General Public License # along with Abydos. If not, see . -"""abydos.distance. +r"""abydos.distance. The distance package implements string distance measure and metric classes: @@ -26,32 +26,248 @@ - Optimal String Alignment distance (:py:class:`.Levenshtein` with ``mode='osa'``) - Damerau-Levenshtein distance (:py:class:`.DamerauLevenshtein`) + - Yujian-Bo normalized edit distance (:py:class:`.YujianBo`) + - Higuera-Micó contextual normalized edit distance + (:py:class:`.HigueraMico`) - Indel distance (:py:class:`.Indel`) + - Syllable Alignment Pattern Searching similarity + (:py:class:`.distance.SAPS`) + - Meta-Levenshtein distance (:py:class:`.MetaLevenshtein`) + - Covington distance (:py:class:`.Covington`) + - ALINE distance (:py:class:`.ALINE`) + - FlexMetric distance (:py:class:`.FlexMetric`) + - BI-SIM similarity (:py:class:`.BISIM`) Hamming distance (:py:class:`.Hamming`) and the closely related Modified Language-Independent Product Name Search distance (:py:class:`.MLIPNS`) are provided. -Distance metrics developed for the US Census are included: +Block edit distances: + + - Tichy edit distance (:py:class:`.Tichy`) + - Levenshtein distance with block operations + (:py:class:`.BlockLevenshtein`) + - Rees-Levenshtein distance (:py:class:`.ReesLevenshtein`) + - Cormode's LZ distance (:py:class:`.CormodeLZ`) + - Shapira-Storer I edit distance with block moves, greedy algorithm + (:py:class:`.ShapiraStorerI`) + +Distance metrics developed for the US Census or derived from them are included: - Jaro distance (:py:class:`.JaroWinkler` with ``mode='Jaro'``) - Jaro-Winkler distance (:py:class:`.JaroWinkler`) - Strcmp95 distance (:py:class:`.Strcmp95`) + - Iterative-SubString (I-Sub) correlation + (:py:class:`.IterativeSubString`) A large set of multi-set token-based distance metrics are provided, including: - - Generalized Minkowski distance (:py:class:`.Minkowski`) - - Manhattan distance (:py:class:`.Manhattan`) - - Euclidean distance (:py:class:`.Euclidean`) + - AMPLE similarity (:py:class:`.AMPLE`) + - AZZOO similarity (:py:class:`.AZZOO`) + - Anderberg's D similarity (:py:class:`.Anderberg`) + - Andres & Marzo's Delta correlation (:py:class:`.AndresMarzoDelta`) + - Baroni-Urbani & Buser I similarity (:py:class:`.BaroniUrbaniBuserI`) + - Baroni-Urbani & Buser II correlation (:py:class:`.BaroniUrbaniBuserII`) + - Batagelj & Bren similarity (:py:class:`.BatageljBren`) + - Baulieu I distance (:py:class:`.BaulieuI`) + - Baulieu II distance (:py:class:`.BaulieuII`) + - Baulieu III distance (:py:class:`.BaulieuIII`) + - Baulieu IV distance (:py:class:`.BaulieuIV`) + - Baulieu V distance (:py:class:`.BaulieuV`) + - Baulieu VI distance (:py:class:`.BaulieuVI`) + - Baulieu VII distance (:py:class:`.BaulieuVII`) + - Baulieu VIII distance 
(:py:class:`.BaulieuVIII`) + - Baulieu IX distance (:py:class:`.BaulieuIX`) + - Baulieu X distance (:py:class:`.BaulieuX`) + - Baulieu XI distance (:py:class:`.BaulieuXI`) + - Baulieu XII distance (:py:class:`.BaulieuXII`) + - Baulieu XIII distance (:py:class:`.BaulieuXIII`) + - Baulieu XIV distance (:py:class:`.BaulieuXIV`) + - Baulieu XV distance (:py:class:`.BaulieuXV`) + - Benini I correlation (:py:class:`.BeniniI`) + - Benini II correlation (:py:class:`.BeniniII`) + - Bennet's S correlation (:py:class:`.Bennet`) + - Braun-Blanquet similarity (:py:class:`.BraunBlanquet`) + - Canberra distance (:py:class:`.Canberra`) - Chebyshev distance (:py:class:`.Chebyshev`) - - Generalized Tversky distance (:py:class:`.Tversky`) - - Sørensen–Dice coefficient (:py:class:`.Dice`) - - Jaccard similarity (:py:class:`.Jaccard`) - - Tanimoto coefficient (:py:meth:`.Jaccard.tanimoto_coeff`) - - Overlap distance (:py:class:`.Overlap`) + - Chord distance (:py:class:`.Chord`) + - Clement similarity (:py:class:`.Clement`) + - Cohen's Kappa similarity (:py:class:`.CohenKappa`) + - Cole correlation (:py:class:`.Cole`) + - Consonni & Todeschini I similarity (:py:class:`.ConsonniTodeschiniI`) + - Consonni & Todeschini II similarity (:py:class:`.ConsonniTodeschiniII`) + - Consonni & Todeschini III similarity (:py:class:`.ConsonniTodeschiniIII`) + - Consonni & Todeschini IV similarity (:py:class:`.ConsonniTodeschiniIV`) + - Consonni & Todeschini V correlation (:py:class:`.ConsonniTodeschiniV`) - Cosine similarity (:py:class:`.Cosine`) + - Dennis similarity (:py:class:`.Dennis`) + - Dice's Asymmetric I similarity (:py:class:`.DiceAsymmetricI`) + - Dice's Asymmetric II similarity (:py:class:`.DiceAsymmetricII`) + - Digby correlation (:py:class:`.Digby`) + - Dispersion correlation (:py:class:`.Dispersion`) + - Doolittle similarity (:py:class:`.Doolittle`) + - Dunning similarity (:py:class:`.Dunning`) + - Euclidean distance (:py:class:`.Euclidean`) + - Eyraud similarity (:py:class:`.Eyraud`) + - Fager & McGowan similarity (:py:class:`.FagerMcGowan`) + - Faith similarity (:py:class:`.Faith`) + - Fidelity similarity (:py:class:`.Fidelity`) + - Fleiss correlation (:py:class:`.Fleiss`) + - Fleiss-Levin-Paik similarity (:py:class:`.FleissLevinPaik`) + - Forbes I similarity (:py:class:`.ForbesI`) + - Forbes II correlation (:py:class:`.ForbesII`) + - Fossum similarity (:py:class:`.Fossum`) + - Generalized Fleiss correlation (:py:class:`.GeneralizedFleiss`) + - Gilbert correlation (:py:class:`.Gilbert`) + - Gilbert & Wells similarity (:py:class:`.GilbertWells`) + - Gini I correlation (:py:class:`.GiniI`) + - Gini II correlation (:py:class:`.GiniII`) + - Goodall similarity (:py:class:`.Goodall`) + - Goodman & Kruskal's Lambda similarity (:py:class:`.GoodmanKruskalLambda`) + - Goodman & Kruskal's Lambda-r correlation + (:py:class:`.GoodmanKruskalLambdaR`) + - Goodman & Kruskal's Tau A similarity (:py:class:`.GoodmanKruskalTauA`) + - Goodman & Kruskal's Tau B similarity (:py:class:`.GoodmanKruskalTauB`) + - Gower & Legendre similarity (:py:class:`.GowerLegendre`) + - Guttman Lambda A similarity (:py:class:`.GuttmanLambdaA`) + - Guttman Lambda B similarity (:py:class:`.GuttmanLambdaB`) + - Gwet's AC correlation (:py:class:`.GwetAC`) + - Hamann correlation (:py:class:`.Hamann`) + - Harris & Lahey similarity (:py:class:`.HarrisLahey`) + - Hassanat distance (:py:class:`.Hassanat`) + - Hawkins & Dotson similarity (:py:class:`.HawkinsDotson`) + - Hellinger distance (:py:class:`.Hellinger`) + - Hurlbert correlation 
(:py:class:`.Hurlbert`) + - Jaccard similarity (:py:class:`.Jaccard`) & + Tanimoto coefficient (:py:meth:`.Jaccard.tanimoto_coeff`) + - Jaccard-NM similarity (:py:class:`.JaccardNM`) + - Johnson similarity (:py:class:`.Johnson`) + - Kendall's Tau correlation (:py:class:`.KendallTau`) + - Kent & Foster I similarity (:py:class:`.KentFosterI`) + - Kent & Foster II similarity (:py:class:`.KentFosterII`) + - Köppen I correlation (:py:class:`.KoppenI`) + - Köppen II similarity (:py:class:`.KoppenII`) + - Kuder & Richardson correlation (:py:class:`.KuderRichardson`) + - Kuhns I correlation (:py:class:`.KuhnsI`) + - Kuhns II correlation (:py:class:`.KuhnsII`) + - Kuhns III correlation (:py:class:`.KuhnsIII`) + - Kuhns IV correlation (:py:class:`.KuhnsIV`) + - Kuhns V correlation (:py:class:`.KuhnsV`) + - Kuhns VI correlation (:py:class:`.KuhnsVI`) + - Kuhns VII correlation (:py:class:`.KuhnsVII`) + - Kuhns VIII correlation (:py:class:`.KuhnsVIII`) + - Kuhns IX correlation (:py:class:`.KuhnsIX`) + - Kuhns X correlation (:py:class:`.KuhnsX`) + - Kuhns XI correlation (:py:class:`.KuhnsXI`) + - Kuhns XII similarity (:py:class:`.KuhnsXII`) + - Kulczynski I similarity (:py:class:`.KulczynskiI`) + - Kulczynski II similarity (:py:class:`.KulczynskiII`) + - Lorentzian distance (:py:class:`.Lorentzian`) + - Maarel correlation (:py:class:`.Maarel`) + - Manhattan distance (:py:class:`.Manhattan`) + - marking distance (:py:class:`.Marking`) + - marking metric (:py:class:`.MarkingMetric`) + - MASI similarity (:py:class:`.MASI`) + - Matusita distance (:py:class:`.Matusita`) + - Maxwell & Pilliner correlation (:py:class:`.MaxwellPilliner`) + - McConnaughey correlation (:py:class:`.McConnaughey`) + - McEwen & Michael correlation (:py:class:`.McEwenMichael`) + - mean squared contingency correlation (:py:class:`.MSContingency`) + - Michael similarity (:py:class:`.Michael`) + - Michelet similarity (:py:class:`.Michelet`) + - Minkowski distance (:py:class:`.Minkowski`) + - Mountford similarity (:py:class:`.Mountford`) + - Mutual Information similarity (:py:class:`.MutualInformation`) + - Overlap distance (:py:class:`.Overlap`) + - Pattern difference (:py:class:`.Pattern`) + - Pearson & Heron II correlation (:py:class:`.PearsonHeronII`) + - Pearson II similarity (:py:class:`.PearsonII`) + - Pearson III correlation (:py:class:`.PearsonIII`) + - Pearson's Chi-Squared similarity (:py:class:`.PearsonChiSquared`) + - Pearson's Phi correlation (:py:class:`.PearsonPhi`) + - Peirce correlation (:py:class:`.Peirce`) + - q-gram distance (:py:class:`.QGram`) + - Rogers & Tanimoto similarity (:py:class:`.RogersTanimoto`) + - Rogot & Goldberg similarity (:py:class:`.RogotGoldberg`) + - Russell & Rao similarity (:py:class:`.RussellRao`) + - Scott's Pi correlation (:py:class:`.ScottPi`) + - Shape difference (:py:class:`.Shape`) + - Size difference (:py:class:`.Size`) + - Sokal & Michener similarity (:py:class:`.SokalMichener`) + - Sokal & Sneath I similarity (:py:class:`.SokalSneathI`) + - Sokal & Sneath II similarity (:py:class:`.SokalSneathII`) + - Sokal & Sneath III similarity (:py:class:`.SokalSneathIII`) + - Sokal & Sneath IV similarity (:py:class:`.SokalSneathIV`) + - Sokal & Sneath V similarity (:py:class:`.SokalSneathV`) + - Sørensen–Dice coefficient (:py:class:`.Dice`) + - Sorgenfrei similarity (:py:class:`.Sorgenfrei`) + - Steffensen similarity (:py:class:`.Steffensen`) + - Stiles similarity (:py:class:`.Stiles`) + - Stuart's Tau correlation (:py:class:`.StuartTau`) + - Tarantula similarity (:py:class:`.Tarantula`) + - 
Tarwid correlation (:py:class:`.Tarwid`) + - Tetrachoric correlation coefficient (:py:class:`.Tetrachronic`) + - Tulloss' R similarity (:py:class:`.TullossR`) + - Tulloss' S similarity (:py:class:`.TullossS`) + - Tulloss' T similarity (:py:class:`.TullossT`) + - Tulloss' U similarity (:py:class:`.TullossU`) + - Tversky distance (:py:class:`.Tversky`) + - Weighted Jaccard similarity (:py:class:`.WeightedJaccard`) + - Unigram subtuple similarity (:py:class:`.UnigramSubtuple`) + - Unknown A correlation (:py:class:`.UnknownA`) + - Unknown B similarity (:py:class:`.UnknownB`) + - Unknown C similarity (:py:class:`.UnknownC`) + - Unknown D similarity (:py:class:`.UnknownD`) + - Unknown E correlation (:py:class:`.UnknownE`) + - Unknown F similarity (:py:class:`.UnknownF`) + - Unknown G similarity (:py:class:`.UnknownG`) + - Unknown H similarity (:py:class:`.UnknownH`) + - Unknown I similarity (:py:class:`.UnknownI`) + - Unknown J similarity (:py:class:`.UnknownJ`) + - Unknown K distance (:py:class:`.UnknownK`) + - Unknown L similarity (:py:class:`.UnknownL`) + - Unknown M similarity (:py:class:`.UnknownM`) + - Upholt similarity (:py:class:`.Upholt`) + - Warrens I correlation (:py:class:`.WarrensI`) + - Warrens II similarity (:py:class:`.WarrensII`) + - Warrens III correlation (:py:class:`.WarrensIII`) + - Warrens IV similarity (:py:class:`.WarrensIV`) + - Warrens V similarity (:py:class:`.WarrensV`) + - Whittaker distance (:py:class:`.Whittaker`) + - Yates' Chi-Squared similarity (:py:class:`.YatesChiSquared`) + - Yule's Q correlation (:py:class:`.YuleQ`) + - Yule's Q II distance (:py:class:`.YuleQII`) + - Yule's Y correlation (:py:class:`.YuleY`) + - YJHHR distance (:py:class:`.YJHHR`) + + - Bhattacharyya distance (:py:class:`.Bhattacharyya`) + - Brainerd-Robinson similarity (:py:class:`.BrainerdRobinson`) + - Quantitative Cosine similarity (:py:class:`.QuantitativeCosine`) + - Quantitative Dice similarity (:py:class:`.QuantitativeDice`) + - Quantitative Jaccard similarity (:py:class:`.QuantitativeJaccard`) + - Roberts similarity (:py:class:`.Roberts`) + - Average linkage distance (:py:class:`.AverageLinkage`) + - Single linkage distance (:py:class:`.SingleLinkage`) + - Complete linkage distance (:py:class:`.CompleteLinkage`) + - Bag distance (:py:class:`.Bag`) + - Soft cosine similarity (:py:class:`.SoftCosine`) - Monge-Elkan distance (:py:class:`.MongeElkan`) + - TF-IDF similarity (:py:class:`.TFIDF`) + - SoftTF-IDF similarity (:py:class:`.SoftTFIDF`) + - Jensen-Shannon divergence (:py:class:`.JensenShannon`) + - Simplified Fellegi-Sunter distance (:py:class:`.FellegiSunter`) + - MinHash similarity (:py:class:`.MinHash`) + + - BLEU similarity (:py:class:`.BLEU`) + - Rouge-L similarity (:py:class:`.RougeL`) + - Rouge-W similarity (:py:class:`.RougeW`) + - Rouge-S similarity (:py:class:`.RougeS`) + - Rouge-SU similarity (:py:class:`.RougeSU`) + + - Positional Q-Gram Dice distance (:py:class:`.PositionalQGramDice`) + - Positional Q-Gram Jaccard distance (:py:class:`.PositionalQGramJaccard`) + - Positional Q-Gram Overlap distance (:py:class:`.PositionalQGramOverlap`) Three popular sequence alignment algorithms are provided: @@ -78,10 +294,19 @@ - zlib (:py:class:`.NCDzlib`) - bzip2 (:py:class:`.NCDbz2`) - lzma (:py:class:`.NCDlzma`) + - LZSS (:py:class:`.NCDlzss`) - arithmetic coding (:py:class:`.NCDarith`) + - PAQ9A (:py:class:`.NCDpaq9a`) - BWT plus RLE (:py:class:`.NCDbwtrle`) - RLE (:py:class:`.NCDrle`) +Three similarity measures from SeatGeek's FuzzyWuzzy: + + - FuzzyWuzzy Partial String 
similarity + (:py:class:`FuzzyWuzzyPartialString`) + - FuzzyWuzzy Token Sort similarity (:py:class:`FuzzyWuzzyTokenSort`) + - FuzzyWuzzy Token Set similarity (:py:class:`FuzzyWuzzyTokenSet`) + The remaining distance measures & metrics include: - Western Airlines' Match Rating Algorithm comparison @@ -89,9 +314,11 @@ - Editex (:py:class:`.Editex`) - Bavarian Landesamt für Statistik distance (:py:class:`.Baystat`) - Eudex distance (:py:class:`.distance.Eudex`) - - Sift4 distance (:py:class:`.Sift4` and :py:class:`.Sift4Simplest`) + - Sift4 distance (:py:class:`.Sift4`, :py:class:`.Sift4Simplest`, + :py:class:`.Sift4Extended`) - Typo distance (:py:class:`.Typo`) - Synoname (:py:class:`.Synoname`) + - Ozbay metric (:py:class:`.Ozbay`) Most of the distance and similarity measures have ``sim`` and ``dist`` methods, which return a measure that is normalized to the range :math:`[0, 1]`. The @@ -122,28 +349,143 @@ unicode_literals, ) +from ._aline import ALINE +from ._ample import AMPLE +from ._anderberg import Anderberg +from ._andres_marzo_delta import AndresMarzoDelta +from ._average_linkage import AverageLinkage +from ._azzoo import AZZOO from ._bag import Bag, bag, dist_bag, sim_bag +from ._baroni_urbani_buser_i import BaroniUrbaniBuserI +from ._baroni_urbani_buser_ii import BaroniUrbaniBuserII +from ._batagelj_bren import BatageljBren +from ._baulieu_i import BaulieuI +from ._baulieu_ii import BaulieuII +from ._baulieu_iii import BaulieuIII +from ._baulieu_iv import BaulieuIV +from ._baulieu_ix import BaulieuIX +from ._baulieu_v import BaulieuV +from ._baulieu_vi import BaulieuVI +from ._baulieu_vii import BaulieuVII +from ._baulieu_viii import BaulieuVIII +from ._baulieu_x import BaulieuX +from ._baulieu_xi import BaulieuXI +from ._baulieu_xii import BaulieuXII +from ._baulieu_xiii import BaulieuXIII +from ._baulieu_xiv import BaulieuXIV +from ._baulieu_xv import BaulieuXV from ._baystat import Baystat, dist_baystat, sim_baystat +from ._benini_i import BeniniI +from ._benini_ii import BeniniII +from ._bennet import Bennet +from ._bhattacharyya import Bhattacharyya +from ._bisim import BISIM +from ._bleu import BLEU +from ._block_levenshtein import BlockLevenshtein +from ._brainerd_robinson import BrainerdRobinson +from ._braun_blanquet import BraunBlanquet +from ._canberra import Canberra from ._chebyshev import Chebyshev, chebyshev +from ._chord import Chord +from ._clement import Clement +from ._cohen_kappa import CohenKappa +from ._cole import Cole +from ._complete_linkage import CompleteLinkage +from ._consonni_todeschini_i import ConsonniTodeschiniI +from ._consonni_todeschini_ii import ConsonniTodeschiniII +from ._consonni_todeschini_iii import ConsonniTodeschiniIII +from ._consonni_todeschini_iv import ConsonniTodeschiniIV +from ._consonni_todeschini_v import ConsonniTodeschiniV +from ._cormode_lz import CormodeLZ from ._cosine import Cosine, dist_cosine, sim_cosine +from ._covington import Covington from ._damerau_levenshtein import ( DamerauLevenshtein, damerau_levenshtein, dist_damerau, sim_damerau, ) +from ._dennis import Dennis from ._dice import Dice, dist_dice, sim_dice +from ._dice_asymmetric_i import DiceAsymmetricI +from ._dice_asymmetric_ii import DiceAsymmetricII +from ._digby import Digby +from ._dispersion import Dispersion +from ._distance import _Distance +from ._doolittle import Doolittle +from ._dunning import Dunning from ._editex import Editex, dist_editex, editex, sim_editex from ._euclidean import Euclidean, dist_euclidean, euclidean, sim_euclidean from ._eudex 
import Eudex, dist_eudex, eudex_hamming, sim_eudex +from ._eyraud import Eyraud +from ._fager_mcgowan import FagerMcGowan +from ._faith import Faith +from ._fellegi_sunter import FellegiSunter +from ._fidelity import Fidelity +from ._fleiss import Fleiss +from ._fleiss_levin_paik import FleissLevinPaik +from ._flexmetric import FlexMetric +from ._forbes_i import ForbesI +from ._forbes_ii import ForbesII +from ._fossum import Fossum +from ._fuzzywuzzy_partial_string import FuzzyWuzzyPartialString +from ._fuzzywuzzy_token_set import FuzzyWuzzyTokenSet +from ._fuzzywuzzy_token_sort import FuzzyWuzzyTokenSort +from ._generalized_fleiss import GeneralizedFleiss +from ._gilbert import Gilbert +from ._gilbert_wells import GilbertWells +from ._gini_i import GiniI +from ._gini_ii import GiniII +from ._goodall import Goodall +from ._goodman_kruskal_lambda import GoodmanKruskalLambda +from ._goodman_kruskal_lambda_r import GoodmanKruskalLambdaR +from ._goodman_kruskal_tau_a import GoodmanKruskalTauA +from ._goodman_kruskal_tau_b import GoodmanKruskalTauB from ._gotoh import Gotoh, gotoh +from ._gower_legendre import GowerLegendre +from ._guttman_lambda_a import GuttmanLambdaA +from ._guttman_lambda_b import GuttmanLambdaB +from ._gwet_ac import GwetAC +from ._hamann import Hamann from ._hamming import Hamming, dist_hamming, hamming, sim_hamming +from ._harris_lahey import HarrisLahey +from ._hassanat import Hassanat +from ._hawkins_dotson import HawkinsDotson +from ._hellinger import Hellinger +from ._higuera_mico import HigueraMico +from ._hurlbert import Hurlbert from ._ident import Ident, dist_ident, sim_ident from ._indel import Indel, dist_indel, indel, sim_indel +from ._iterative_substring import IterativeSubString from ._jaccard import Jaccard, dist_jaccard, sim_jaccard, tanimoto +from ._jaccard_nm import JaccardNM from ._jaro_winkler import JaroWinkler, dist_jaro_winkler, sim_jaro_winkler +from ._jensen_shannon import JensenShannon +from ._johnson import Johnson +from ._kendall_tau import KendallTau +from ._kent_foster_i import KentFosterI +from ._kent_foster_ii import KentFosterII +from ._koppen_i import KoppenI +from ._koppen_ii import KoppenII +from ._kuder_richardson import KuderRichardson +from ._kuhns_i import KuhnsI +from ._kuhns_ii import KuhnsII +from ._kuhns_iii import KuhnsIII +from ._kuhns_iv import KuhnsIV +from ._kuhns_ix import KuhnsIX +from ._kuhns_v import KuhnsV +from ._kuhns_vi import KuhnsVI +from ._kuhns_vii import KuhnsVII +from ._kuhns_viii import KuhnsVIII +from ._kuhns_x import KuhnsX +from ._kuhns_xi import KuhnsXI +from ._kuhns_xii import KuhnsXII +from ._kulczynski_i import KulczynskiI +from ._kulczynski_ii import KulczynskiII +from ._lcprefix import LCPrefix from ._lcsseq import LCSseq, dist_lcsseq, lcsseq, sim_lcsseq from ._lcsstr import LCSstr, dist_lcsstr, lcsstr, sim_lcsstr +from ._lcsuffix import LCSuffix from ._length import Length, dist_length, sim_length from ._levenshtein import ( Levenshtein, @@ -151,35 +493,135 @@ levenshtein, sim_levenshtein, ) +from ._lorentzian import Lorentzian +from ._maarel import Maarel from ._manhattan import Manhattan, dist_manhattan, manhattan, sim_manhattan +from ._marking import Marking +from ._marking_metric import MarkingMetric +from ._masi import MASI +from ._matusita import Matusita +from ._maxwell_pilliner import MaxwellPilliner +from ._mcconnaughey import McConnaughey +from ._mcewen_michael import McEwenMichael +from ._meta_levenshtein import MetaLevenshtein +from ._michelet import Michelet +from ._minhash import 
MinHash from ._minkowski import Minkowski, dist_minkowski, minkowski, sim_minkowski from ._mlipns import MLIPNS, dist_mlipns, sim_mlipns from ._monge_elkan import MongeElkan, dist_monge_elkan, sim_monge_elkan +from ._mountford import Mountford from ._mra import MRA, dist_mra, mra_compare, sim_mra +from ._ms_contingency import MSContingency +from ._mutual_information import MutualInformation from ._ncd_arith import NCDarith, dist_ncd_arith, sim_ncd_arith from ._ncd_bwtrle import NCDbwtrle, dist_ncd_bwtrle, sim_ncd_bwtrle from ._ncd_bz2 import NCDbz2, dist_ncd_bz2, sim_ncd_bz2 from ._ncd_lzma import NCDlzma, dist_ncd_lzma, sim_ncd_lzma +from ._ncd_lzss import NCDlzss +from ._ncd_paq9a import NCDpaq9a from ._ncd_rle import NCDrle, dist_ncd_rle, sim_ncd_rle from ._ncd_zlib import NCDzlib, dist_ncd_zlib, sim_ncd_zlib from ._needleman_wunsch import NeedlemanWunsch, needleman_wunsch from ._overlap import Overlap, dist_overlap, sim_overlap +from ._ozbay import Ozbay +from ._pattern import Pattern +from ._pearson_chi_squared import PearsonChiSquared +from ._pearson_heron_ii import PearsonHeronII +from ._pearson_ii import PearsonII +from ._pearson_iii import PearsonIII +from ._pearson_phi import PearsonPhi +from ._peirce import Peirce +from ._positional_q_gram_dice import PositionalQGramDice +from ._positional_q_gram_jaccard import PositionalQGramJaccard +from ._positional_q_gram_overlap import PositionalQGramOverlap from ._prefix import Prefix, dist_prefix, sim_prefix +from ._q_gram import QGram +from ._quantitative_cosine import QuantitativeCosine +from ._quantitative_dice import QuantitativeDice +from ._quantitative_jaccard import QuantitativeJaccard from ._ratcliff_obershelp import ( RatcliffObershelp, dist_ratcliff_obershelp, sim_ratcliff_obershelp, ) +from ._rees_levenshtein import ReesLevenshtein +from ._roberts import Roberts +from ._rogers_tanimoto import RogersTanimoto +from ._rogot_goldberg import RogotGoldberg +from ._rouge_l import RougeL +from ._rouge_s import RougeS +from ._rouge_su import RougeSU +from ._rouge_w import RougeW +from ._russell_rao import RussellRao +from ._saps import SAPS +from ._scott_pi import ScottPi +from ._shape import Shape +from ._shapira_storer_i import ShapiraStorerI from ._sift4 import Sift4, dist_sift4, sift4_common, sim_sift4 +from ._sift4_extended import Sift4Extended from ._sift4_simplest import Sift4Simplest, sift4_simplest +from ._single_linkage import SingleLinkage +from ._size import Size from ._smith_waterman import SmithWaterman, smith_waterman +from ._soft_cosine import SoftCosine +from ._softtf_idf import SoftTFIDF +from ._sokal_michener import SokalMichener +from ._sokal_sneath_i import SokalSneathI +from ._sokal_sneath_ii import SokalSneathII +from ._sokal_sneath_iii import SokalSneathIII +from ._sokal_sneath_iv import SokalSneathIV +from ._sokal_sneath_v import SokalSneathV +from ._sorgenfrei import Sorgenfrei +from ._steffensen import Steffensen +from ._stiles import Stiles from ._strcmp95 import Strcmp95, dist_strcmp95, sim_strcmp95 +from ._stuart_tau import StuartTau from ._suffix import Suffix, dist_suffix, sim_suffix from ._synoname import Synoname, synoname +from ._tarantula import Tarantula +from ._tarwid import Tarwid +from ._tetrachoric import Tetrachoric +from ._tf_idf import TFIDF +from ._tichy import Tichy +from ._token_distance import _TokenDistance +from ._tulloss_r import TullossR +from ._tulloss_s import TullossS +from ._tulloss_t import TullossT +from ._tulloss_u import TullossU from ._tversky import Tversky, dist_tversky, 
sim_tversky from ._typo import Typo, dist_typo, sim_typo, typo +from ._unigram_subtuple import UnigramSubtuple +from ._unknown_a import UnknownA +from ._unknown_b import UnknownB +from ._unknown_c import UnknownC +from ._unknown_d import UnknownD +from ._unknown_e import UnknownE +from ._unknown_f import UnknownF +from ._unknown_g import UnknownG +from ._unknown_h import UnknownH +from ._unknown_i import UnknownI +from ._unknown_j import UnknownJ +from ._unknown_k import UnknownK +from ._unknown_l import UnknownL +from ._unknown_m import UnknownM +from ._upholt import Upholt +from ._warrens_i import WarrensI +from ._warrens_ii import WarrensII +from ._warrens_iii import WarrensIII +from ._warrens_iv import WarrensIV +from ._warrens_v import WarrensV +from ._weighted_jaccard import WeightedJaccard +from ._whittaker import Whittaker +from ._yates_chi_squared import YatesChiSquared +from ._yjhhr import YJHHR +from ._yujian_bo import YujianBo +from ._yule_q import YuleQ +from ._yule_q_ii import YuleQII +from ._yule_y import YuleY __all__ = [ + '_Distance', + '_TokenDistance', 'sim', 'dist', 'Levenshtein', @@ -190,57 +632,262 @@ 'damerau_levenshtein', 'dist_damerau', 'sim_damerau', + 'ShapiraStorerI', + 'Marking', + 'MarkingMetric', + 'YujianBo', + 'HigueraMico', 'Indel', 'indel', 'dist_indel', 'sim_indel', + 'SAPS', + 'MetaLevenshtein', + 'Covington', + 'ALINE', + 'FlexMetric', + 'BISIM', 'Hamming', 'hamming', 'dist_hamming', 'sim_hamming', + 'MLIPNS', + 'dist_mlipns', + 'sim_mlipns', + 'Tichy', + 'BlockLevenshtein', + 'CormodeLZ', 'JaroWinkler', 'dist_jaro_winkler', 'sim_jaro_winkler', 'Strcmp95', 'dist_strcmp95', 'sim_strcmp95', - 'Minkowski', - 'minkowski', - 'dist_minkowski', - 'sim_minkowski', - 'Manhattan', - 'manhattan', - 'dist_manhattan', - 'sim_manhattan', - 'Euclidean', - 'euclidean', - 'dist_euclidean', - 'sim_euclidean', + 'IterativeSubString', + 'AMPLE', + 'AZZOO', + 'Anderberg', + 'AndresMarzoDelta', + 'BaroniUrbaniBuserI', + 'BaroniUrbaniBuserII', + 'BatageljBren', + 'BaulieuI', + 'BaulieuII', + 'BaulieuIII', + 'BaulieuIV', + 'BaulieuV', + 'BaulieuVI', + 'BaulieuVII', + 'BaulieuVIII', + 'BaulieuIX', + 'BaulieuX', + 'BaulieuXI', + 'BaulieuXII', + 'BaulieuXIII', + 'BaulieuXIV', + 'BaulieuXV', + 'BeniniI', + 'BeniniII', + 'Bennet', + 'BraunBlanquet', + 'Canberra', 'Chebyshev', 'chebyshev', - 'Tversky', - 'dist_tversky', - 'sim_tversky', + 'Chord', + 'Clement', + 'CohenKappa', + 'Cole', + 'ConsonniTodeschiniI', + 'ConsonniTodeschiniII', + 'ConsonniTodeschiniIII', + 'ConsonniTodeschiniIV', + 'ConsonniTodeschiniV', + 'Cosine', + 'dist_cosine', + 'sim_cosine', + 'Dennis', 'Dice', 'dist_dice', 'sim_dice', + 'DiceAsymmetricI', + 'DiceAsymmetricII', + 'Digby', + 'Dispersion', + 'Doolittle', + 'Dunning', + 'Euclidean', + 'euclidean', + 'dist_euclidean', + 'sim_euclidean', + 'Eyraud', + 'FagerMcGowan', + 'Faith', + 'Fidelity', + 'Fleiss', + 'FleissLevinPaik', + 'ForbesI', + 'ForbesII', + 'Fossum', + 'GeneralizedFleiss', + 'Gilbert', + 'GilbertWells', + 'GiniI', + 'GiniII', + 'Goodall', + 'GoodmanKruskalLambda', + 'GoodmanKruskalLambdaR', + 'GoodmanKruskalTauA', + 'GoodmanKruskalTauB', + 'GowerLegendre', + 'GuttmanLambdaA', + 'GuttmanLambdaB', + 'GwetAC', + 'Hamann', + 'HarrisLahey', + 'Hassanat', + 'HawkinsDotson', + 'Hellinger', + 'Hurlbert', 'Jaccard', 'dist_jaccard', 'sim_jaccard', 'tanimoto', + 'JaccardNM', + 'Johnson', + 'KendallTau', + 'KentFosterI', + 'KentFosterII', + 'KoppenI', + 'KoppenII', + 'KuderRichardson', + 'KuhnsI', + 'KuhnsII', + 'KuhnsIII', + 'KuhnsIV', + 'KuhnsV', + 
'KuhnsVI', + 'KuhnsVII', + 'KuhnsVIII', + 'KuhnsIX', + 'KuhnsX', + 'KuhnsXI', + 'KuhnsXII', + 'KulczynskiI', + 'KulczynskiII', + 'Lorentzian', + 'Maarel', + 'Manhattan', + 'manhattan', + 'dist_manhattan', + 'sim_manhattan', + 'Michelet', + 'Minkowski', + 'minkowski', + 'dist_minkowski', + 'sim_minkowski', + 'MASI', + 'Matusita', + 'MaxwellPilliner', + 'McConnaughey', + 'McEwenMichael', + 'Mountford', + 'MutualInformation', + 'MSContingency', 'Overlap', 'dist_overlap', 'sim_overlap', - 'Cosine', - 'dist_cosine', - 'sim_cosine', + 'Pattern', + 'PearsonHeronII', + 'PearsonII', + 'PearsonIII', + 'PearsonChiSquared', + 'PearsonPhi', + 'Peirce', + 'QGram', + 'ReesLevenshtein', + 'RogersTanimoto', + 'RogotGoldberg', + 'RussellRao', + 'ScottPi', + 'Shape', + 'Size', + 'SokalMichener', + 'SokalSneathI', + 'SokalSneathII', + 'SokalSneathIII', + 'SokalSneathIV', + 'SokalSneathV', + 'Sorgenfrei', + 'Steffensen', + 'Stiles', + 'StuartTau', + 'Tarantula', + 'Tarwid', + 'Tetrachoric', + 'TullossR', + 'TullossS', + 'TullossT', + 'TullossU', + 'Tversky', + 'dist_tversky', + 'sim_tversky', + 'UnigramSubtuple', + 'UnknownA', + 'UnknownB', + 'UnknownC', + 'UnknownD', + 'UnknownE', + 'UnknownF', + 'UnknownG', + 'UnknownH', + 'UnknownI', + 'UnknownJ', + 'UnknownK', + 'UnknownL', + 'UnknownM', + 'Upholt', + 'WarrensI', + 'WarrensII', + 'WarrensIII', + 'WarrensIV', + 'WarrensV', + 'WeightedJaccard', + 'Whittaker', + 'YatesChiSquared', + 'YuleQ', + 'YuleQII', + 'YuleY', + 'YJHHR', + 'Bhattacharyya', + 'BrainerdRobinson', + 'QuantitativeCosine', + 'QuantitativeDice', + 'QuantitativeJaccard', + 'Roberts', + 'AverageLinkage', + 'SingleLinkage', + 'CompleteLinkage', 'Bag', 'bag', 'dist_bag', 'sim_bag', + 'SoftCosine', 'MongeElkan', 'dist_monge_elkan', 'sim_monge_elkan', + 'TFIDF', + 'SoftTFIDF', + 'JensenShannon', + 'FellegiSunter', + 'MinHash', + 'BLEU', + 'RougeL', + 'RougeW', + 'RougeS', + 'RougeSU', + 'PositionalQGramDice', + 'PositionalQGramJaccard', + 'PositionalQGramOverlap', 'NeedlemanWunsch', 'needleman_wunsch', 'SmithWaterman', @@ -255,6 +902,8 @@ 'lcsstr', 'dist_lcsstr', 'sim_lcsstr', + 'LCPrefix', + 'LCSuffix', 'RatcliffObershelp', 'dist_ratcliff_obershelp', 'sim_ratcliff_obershelp', @@ -288,6 +937,11 @@ 'NCDrle', 'dist_ncd_rle', 'sim_ncd_rle', + 'NCDpaq9a', + 'NCDlzss', + 'FuzzyWuzzyPartialString', + 'FuzzyWuzzyTokenSort', + 'FuzzyWuzzyTokenSet', 'MRA', 'mra_compare', 'dist_mra', @@ -296,9 +950,6 @@ 'editex', 'dist_editex', 'sim_editex', - 'MLIPNS', - 'dist_mlipns', - 'sim_mlipns', 'Baystat', 'dist_baystat', 'sim_baystat', @@ -308,6 +959,7 @@ 'sim_eudex', 'Sift4', 'Sift4Simplest', + 'Sift4Extended', 'sift4_common', 'sift4_simplest', 'dist_sift4', @@ -318,6 +970,7 @@ 'sim_typo', 'Synoname', 'synoname', + 'Ozbay', ] @@ -356,6 +1009,8 @@ def sim(src, tar, method=sim_levenshtein): >>> sim('ATCG', 'TAGC') 0.25 + .. versionadded:: 0.1.0 + """ if callable(method): return method(src, tar) @@ -400,6 +1055,8 @@ def dist(src, tar, method=sim_levenshtein): >>> dist('ATCG', 'TAGC') 0.75 + .. versionadded:: 0.1.0 + """ if callable(method): return 1 - method(src, tar) diff --git a/abydos/distance/_aline.py b/abydos/distance/_aline.py new file mode 100644 index 000000000..ab1bc7626 --- /dev/null +++ b/abydos/distance/_aline.py @@ -0,0 +1,1662 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
+
+"""abydos.distance._aline.
+
+ALINE alignment, similarity, and distance
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+from copy import deepcopy
+
+from numpy import NINF
+from numpy import float as np_float
+from numpy import zeros as np_zeros
+
+from ._distance import _Distance
+
+__all__ = ['ALINE']
+
+
+class ALINE(_Distance):
+    r"""ALINE alignment, similarity, and distance.
+
+    ALINE alignment was developed by
+    :cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an
+    alignment algorithm based on multivalued phonetic features and feature
+    salience weights. Along with the alignment itself, the algorithm produces
+    a term similarity score.
+
+    :cite:`Downey:2008` develops ALINE's similarity score into a similarity
+    measure & distance measure:
+
+    .. math::
+
+        sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)}
+        {score_{ALINE}(src, src) + score_{ALINE}(tar, tar)}
+
+    However, because the average of the two self-similarity scores is not
+    guaranteed to be greater than or equal to the similarity score between
+    the two strings, by default, this formula is not used here in order to
+    guarantee that the similarity measure is bounded to the range [0, 1].
+    Instead, Kondrak's similarity measure is employed:
+
+    .. math::
+
+        sim_{ALINE} = \frac{score_{ALINE}(src, tar)}
+        {max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))}
+
+
+    .. versionadded:: 0.4.0
+    """
+
+    # The three dicts below are mostly copied from NLTK's implementation
+    # https://www.nltk.org/_modules/nltk/metrics/aline.html
+    # But values have been returned, as much as possible, to the reference
+    # values supplied in Kondrak's paper.
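+    # A rough worked example of the normalization above, using the default
+    # costs (c_sub=35, c_vwl=10) and Kondrak's symbol set: the 'c'/'h'
+    # substitution scores 35 - 35 = 0, 'a'/'a' scores 35 - 2*10 = 15, and
+    # 't'/'t' scores 35, so score('cat', 'hat') = 50, while each self-score
+    # is 35 + 15 + 35 = 85. Kondrak's similarity is then 50 / max(85, 85),
+    # roughly 0.588, and the corresponding distance is roughly 0.412, as in
+    # the doctests further below.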
+ feature_weights = { + # place + 'bilabial': 1.0, + 'labiodental': 0.95, + 'dental': 0.9, + 'alveolar': 0.85, + 'retroflex': 0.8, + 'palato-alveolar': 0.75, + 'palatal': 0.7, + 'velar': 0.6, + 'uvular': 0.5, + 'pharyngeal': 0.3, + 'glottal': 0.1, + # manner + 'stop': 1.0, + 'affricate': 0.9, + 'fricative': 0.8, + 'approximant': 0.6, + 'trill': 0.55, # not in original + 'tap': 0.5, # not in original + 'high vowel': 0.4, + 'mid vowel': 0.2, + 'low vowel': 0.0, + # high + 'high': 1.0, + 'mid': 0.5, + 'low': 0.0, + # back + 'front': 1.0, + 'central': 0.5, + 'back': 0.0, + # binary features + 'plus': 1.0, + 'minus': 0.0, + } + + v_features = { + 'syllabic', + 'nasal', + 'retroflex', + 'high', + 'back', + 'round', + 'long', + } + c_features = { + 'syllabic', + 'manner', + 'voice', + 'nasal', + 'retroflex', + 'lateral', + 'aspirated', + 'place', + } + + salience = { + 'syllabic': 5, + 'voice': 10, + 'lateral': 10, + 'high': 5, + 'manner': 50, + 'long': 1, + 'place': 40, + 'nasal': 10, + 'aspirated': 5, + 'back': 5, + 'retroflex': 10, + 'round': 5, + } + + phones_ipa = { + 'p': { + 'place': 'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'b': { + 'place': 'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 't': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'd': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʈ': { + 'place': 'retroflex', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɖ': { + 'place': 'retroflex', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'c': { + 'place': 'palatal', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɟ': { + 'place': 'palatal', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'k': { + 'place': 'velar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'g': { + 'place': 'velar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'q': { + 'place': 'uvular', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɢ': { + 'place': 'uvular', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʔ': { + 'place': 'glottal', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'm': { + 'place': 
'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɱ': { + 'place': 'labiodental', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'n': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɳ': { + 'place': 'retroflex', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɲ': { + 'place': 'palatal', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ŋ': { + 'place': 'velar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɴ': { + 'place': 'uvular', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʙ': { + 'place': 'bilabial', + 'manner': 'trill', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'r': { + 'place': 'alveolar', + 'manner': 'trill', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʀ': { + 'place': 'uvular', + 'manner': 'trill', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɾ': { + 'place': 'alveolar', + 'manner': 'tap', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɽ': { + 'place': 'retroflex', + 'manner': 'tap', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɸ': { + 'place': 'bilabial', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'β': { + 'place': 'bilabial', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'f': { + 'place': 'labiodental', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'v': { + 'place': 'labiodental', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'θ': { + 'place': 'dental', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ð': { + 'place': 'dental', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 's': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 
'lateral': 'minus', + 'aspirated': 'minus', + }, + 'z': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʃ': { + 'place': 'palato-alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʒ': { + 'place': 'palato-alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʂ': { + 'place': 'retroflex', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʐ': { + 'place': 'retroflex', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ç': { + 'place': 'palatal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʝ': { + 'place': 'palatal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'x': { + 'place': 'velar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɣ': { + 'place': 'velar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'χ': { + 'place': 'uvular', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʁ': { + 'place': 'uvular', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ħ': { + 'place': 'pharyngeal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ʕ': { + 'place': 'pharyngeal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'h': { + 'place': 'glottal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɦ': { + 'place': 'glottal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɬ': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'plus', + 'aspirated': 'minus', + }, + 'ɮ': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'plus', + 'aspirated': 'minus', + }, + 'ʋ': { + 'place': 'labiodental', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + 
}, + 'ɹ': { + 'place': 'alveolar', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɻ': { + 'place': 'retroflex', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'j': { + 'place': 'palatal', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'ɰ': { + 'place': 'velar', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + }, + 'l': { + 'place': 'alveolar', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'plus', + 'aspirated': 'minus', + }, + 'w': { + 'place': 'velar', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'aspirated': 'minus', + 'double': 'bilabial', + }, + 'i': { + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'front', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'y': { + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'front', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'e': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'front', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ø': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'front', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ɛ': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'front', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'œ': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'front', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'æ': { + 'manner': 'low vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'low', + 'back': 'front', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'a': { + 'manner': 'low vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'low', + 'back': 'front', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ɨ': { + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'central', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ʉ': { + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 
'high': 'high', + 'back': 'central', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ə': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'central', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'u': { + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'back', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'o': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'back', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ɔ': { + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'back', + 'round': 'plus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ɒ': { + 'manner': 'low vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'low', + 'back': 'back', + 'round': 'minus', + 'long': 'minus', + 'aspirated': 'minus', + }, + 'ː': {'long': 'plus', 'supplemental': True}, + 'ʰ': {'aspirated': 'plus', 'supplemental': True}, + } + + phones_kondrak = { + 'a': { + 'place': 'velar', + 'manner': 'low vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'low', + 'back': 'central', + 'round': 'minus', + }, + 'b': { + 'place': 'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'c': { + 'place': 'alveolar', + 'manner': 'affricate', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'd': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'e': { + 'place': 'palatal', + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'front', + 'round': 'minus', + }, + 'f': { + 'place': 'labiodental', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'g': { + 'place': 'velar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'h': { + 'place': 'glottal', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'i': { + 'place': 'palatal', + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'front', + 'round': 'plus', + }, + 'j': { + 'place': 'alveolar', + 'manner': 'affricate', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'k': { + 'place': 'velar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'l': { + 'place': 'alveolar', + 'manner': 'approximant', + 'syllabic': 'minus', + 
'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'plus', + }, + 'm': { + 'place': 'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'n': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'plus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'o': { + 'place': 'velar', + 'manner': 'mid vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'mid', + 'back': 'back', + 'round': 'plus', + }, + 'p': { + 'place': 'bilabial', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'q': { + 'place': 'glottal', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'r': { + 'place': 'retroflex', + 'manner': 'approximant', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'plus', + 'lateral': 'minus', + }, + 's': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 't': { + 'place': 'alveolar', + 'manner': 'stop', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'u': { + 'place': 'velar', + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'back', + 'round': 'plus', + }, + 'v': { + 'place': 'labiodental', + 'manner': 'fricative', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'w': { + 'place': 'velar', + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'back', + 'round': 'plus', + 'double': 'bilabial', + }, + 'x': { + 'place': 'velar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'minus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'y': { + 'place': 'velar', + 'manner': 'high vowel', + 'syllabic': 'plus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + 'high': 'high', + 'back': 'front', + 'round': 'minus', + }, + 'z': { + 'place': 'alveolar', + 'manner': 'fricative', + 'syllabic': 'minus', + 'voice': 'plus', + 'nasal': 'minus', + 'retroflex': 'minus', + 'lateral': 'minus', + }, + 'A': {'aspirated': 'plus', 'supplemental': True}, + 'B': {'back': 'back', 'supplemental': True}, + 'C': {'back': 'central', 'supplemental': True}, + 'D': {'place': 'dental', 'supplemental': True}, + 'F': {'back': 'front', 'supplemental': True}, + 'H': {'long': 'plus', 'supplemental': True}, + 'N': {'nasal': 'plus', 'supplemental': True}, + 'P': {'place': 'palatal', 'supplemental': True}, + 'R': {'round': 'plus', 'supplemental': True}, + 'S': {'manner': 'fricative', 'supplemental': True}, + 'V': {'place': 'palato-alveolar', 'supplemental': True}, + } + + def __init__( + self, + epsilon=0, + c_skip=-10, + c_sub=35, + c_exp=45, + c_vwl=10, + mode='local', + phones='aline', + normalizer=max, + **kwargs + ): + """Initialize ALINE instance. 
+
+        Parameters
+        ----------
+        epsilon : float
+            The portion (out of 1.0) of the maximum ALINE score, above which
+            alignments are returned. If set to 0, only the alignments matching
+            the maximum alignment score are returned. If set to 1, all
+            alignments scoring 0 or higher are returned.
+        c_skip : int
+            The cost of an insertion or deletion
+        c_sub : int
+            The cost of a substitution
+        c_exp : int
+            The cost of an expansion or contraction
+        c_vwl : int
+            The additional cost of a vowel substitution, expansion, or
+            contraction
+        mode : str
+            Alignment mode, which can be ``local`` (default), ``global``,
+            ``half-local``, or ``semi-global``
+        phones : str
+            Phonetic symbol set, which can be:
+                - ``aline`` selects Kondrak's original symbol set
+                - ``ipa`` selects IPA symbols
+        normalizer : function
+            A function that takes a list and computes a normalization term
+            by which the edit distance is divided (max by default). For the
+            normalization proposed by Downey, et al. (2008), set this to:
+            ``lambda x: sum(x)/len(x)``
+        **kwargs
+            Arbitrary keyword arguments
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        super(ALINE, self).__init__(**kwargs)
+        self._epsilon = epsilon
+        self._c_skip = c_skip
+        self._c_sub = c_sub
+        self._c_exp = c_exp
+        self._c_vwl = c_vwl
+        self._mode = mode
+        if self._mode not in {'local', 'global', 'half-local', 'semi-global'}:
+            self._mode = 'local'
+        if phones == 'ipa':
+            self._phones = self.phones_ipa
+        else:
+            self._phones = self.phones_kondrak
+        self._normalizer = normalizer
+
+    def alignment(self, src, tar, score_only=False):
+        """Return the ALINE alignments of two strings.
+
+        Parameters
+        ----------
+        src : str
+            Source string for comparison
+        tar : str
+            Target string for comparison
+        score_only : bool
+            Return the score only, not the alignments
+
+        Returns
+        -------
+        list(tuple(float, str, str)) or float
+            ALINE alignments and their scores or the top score
+
+        Examples
+        --------
+        >>> cmp = ALINE()
+        >>> cmp.alignment('cat', 'hat')
+        [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
+        >>> cmp.alignment('niall', 'neil')
+        [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
+        >>> cmp.alignment('aluminum', 'catalan')
+        [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
+        >>> cmp.alignment('atcg', 'tagc')
+        [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
+        '‖ t a g ‖ c')]
+
+
+        ..
versionadded:: 0.4.0 + + """ + + def _sig_skip(seg): + return self._c_skip + + def _sig_sub(seg1, seg2): + return ( + self._c_sub + - _delta(seg1, seg2) + - _sig_vwl(seg1) + - _sig_vwl(seg2) + ) + + def _sig_exp(seg1, seg2a, seg2b): + return ( + self._c_exp + - _delta(seg1, seg2a) + - _delta(seg1, seg2b) + - _sig_vwl(seg1) + - max(_sig_vwl(seg2a), _sig_vwl(seg2b)) + ) + + def _sig_vwl(seg): + return ( + 0.0 + if seg['manner'] > self.feature_weights['high vowel'] + else self._c_vwl + ) + + def _delta(seg1, seg2): + features = ( + self.c_features + if max(seg1['manner'], seg2['manner']) + > self.feature_weights['high vowel'] + else self.v_features + ) + diff = 0.0 + for f in features: + diff += ( + abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f] + ) + return diff + + def _retrieve(i, j, score, out): + def _record(score, out): + out.append(('‖', '‖')) + for i1 in range(i - 1, -1, -1): + out.append((src[i1]['segment'], '')) + for j1 in range(j - 1, -1, -1): + out.append(('', tar[j1]['segment'])) + if self._mode == 'global': + score += (i + j) * _sig_skip('') + + out = out[::-1] + + src_alignment = [] + tar_alignment = [] + + out.append(('‖', '‖')) + part = 0 + s_segment = '' + t_segment = '' + for ss, ts in out: + if ss == '‖': + if part % 2 == 0: + src_alignment.append(s_segment) + tar_alignment.append(t_segment) + s_segment = [] + t_segment = [] + else: + src_alignment.append(' '.join(s_segment)) + tar_alignment.append(' '.join(t_segment)) + s_segment = '' + t_segment = '' + part += 1 + else: + if part % 2 == 0: + s_segment += ss + t_segment += ts + else: + s_segment.append(ss + ' ' * (len(ts) - len(ss))) + t_segment.append(ts + ' ' * (len(ss) - len(ts))) + + src_alignment = ' ‖ '.join(src_alignment).strip() + tar_alignment = ' ‖ '.join(tar_alignment).strip() + + alignments.append((score, src_alignment, tar_alignment)) + return + + if s_mat[i, j] == 0: + _record(score, out) + return + else: + if ( + i > 0 + and j > 0 + and s_mat[i - 1, j - 1] + + _sig_sub(src[i - 1], tar[j - 1]) + + score + >= threshold + ): + loc_out = deepcopy(out) + loc_out.append( + (src[i - 1]['segment'], tar[j - 1]['segment']) + ) + _retrieve( + i - 1, + j - 1, + score + _sig_sub(src[i - 1], tar[j - 1]), + loc_out, + ) + loc_out.pop() + + if ( + j > 0 + and s_mat[i, j - 1] + _sig_skip(tar[j - 1]) + score + >= threshold + ): + loc_out = deepcopy(out) + loc_out.append(('-', tar[j - 1]['segment'])) + _retrieve(i, j - 1, score + _sig_skip(tar[j - 1]), loc_out) + loc_out.pop() + + if ( + i > 0 + and j > 1 + and s_mat[i - 1, j - 2] + + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1]) + + score + >= threshold + ): + loc_out = deepcopy(out) + loc_out.append( + ( + src[i - 1]['segment'], + tar[j - 2]['segment'] + tar[j - 1]['segment'], + ) + ) + _retrieve( + i - 1, + j - 2, + score + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1]), + loc_out, + ) + loc_out.pop() + + if ( + i > 0 + and s_mat[i - 1, j] + _sig_skip(src[i - 1]) + score + >= threshold + ): + loc_out = deepcopy(out) + loc_out.append((src[i - 1]['segment'], '-')) + _retrieve(i - 1, j, score + _sig_skip(src[i - 1]), loc_out) + loc_out.pop() + + if ( + i > 1 + and j > 0 + and s_mat[i - 2, j - 1] + + _sig_exp(tar[j - 1], src[i - 2], src[i - 1]) + + score + >= threshold + ): + loc_out = deepcopy(out) + loc_out.append( + ( + src[i - 2]['segment'] + src[i - 1]['segment'], + tar[j - 1]['segment'], + ) + ) + _retrieve( + i - 2, + j - 1, + score + _sig_exp(tar[j - 1], src[i - 2], src[i - 1]), + loc_out, + ) + loc_out.pop() + + sg_max = 0.0 + + src = list(src) + tar = 
list(tar) + + for ch in range(len(src)): + if src[ch] in self._phones: + seg = src[ch] + src[ch] = dict(self._phones[src[ch]]) + src[ch]['segment'] = seg + for ch in range(len(tar)): + if tar[ch] in self._phones: + seg = tar[ch] + tar[ch] = dict(self._phones[tar[ch]]) + tar[ch]['segment'] = seg + + src = [fb for fb in src if isinstance(fb, dict)] + tar = [fb for fb in tar if isinstance(fb, dict)] + + for i in range(1, len(src)): + if 'supplemental' in src[i]: + j = i - 1 + while j > -1: + if 'supplemental' not in src[j]: + for key, value in src[i].items(): + if key != 'supplemental': + if key == 'segment': + src[j]['segment'] += value + else: + src[j][key] = value + j = 0 + j -= 1 + src = [fb for fb in src if 'supplemental' not in fb] + + for i in range(1, len(tar)): + if 'supplemental' in tar[i]: + j = i - 1 + while j > -1: + if 'supplemental' not in tar[j]: + for key, value in tar[i].items(): + if key != 'supplemental': + if key == 'segment': + tar[j]['segment'] += value + else: + tar[j][key] = value + j = 0 + j -= 1 + tar = [fb for fb in tar if 'supplemental' not in fb] + + for i in range(len(src)): + for key in src[i].keys(): + if key != 'segment': + src[i][key] = self.feature_weights[src[i][key]] + for i in range(len(tar)): + for key in tar[i].keys(): + if key != 'segment': + tar[i][key] = self.feature_weights[tar[i][key]] + + src_len = len(src) + tar_len = len(tar) + + s_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) + + if self._mode == 'global': + for i in range(1, src_len + 1): + s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src[i - 1]) + for j in range(1, tar_len + 1): + s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar[j - 1]) + + for i in range(1, src_len + 1): + for j in range(1, tar_len + 1): + s_mat[i, j] = max( + s_mat[i - 1, j] + _sig_skip(src[i - 1]), + s_mat[i, j - 1] + _sig_skip(tar[j - 1]), + s_mat[i - 1, j - 1] + _sig_sub(src[i - 1], tar[j - 1]), + s_mat[i - 1, j - 2] + + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1]) + if j > 1 + else NINF, + s_mat[i - 2, j - 1] + + _sig_exp(tar[j - 1], src[i - 2], src[i - 1]) + if i > 1 + else NINF, + 0 if self._mode in {'local', 'half-local'} else NINF, + ) + + if s_mat[i, j] > sg_max: + if self._mode == 'semi-global': + if i == src_len or j == tar_len: + sg_max = s_mat[i, j] + else: + sg_max = s_mat[i, j] + + if self._mode in {'global', 'half-local'}: + dp_score = s_mat[src_len, tar_len] + else: + dp_score = s_mat.max() + + if score_only: + return dp_score + + threshold = (1 - self._epsilon) * dp_score + + alignments = [] + + for i in range(1, src_len + 1): + for j in range(1, tar_len + 1): + if self._mode in {'global', 'half-local'} and ( + i < src_len or j < tar_len + ): + continue + if self._mode == 'semi-global' and ( + i < src_len and j < tar_len + ): + continue + if s_mat[i, j] >= threshold: + out = [] + for j1 in range(tar_len - 1, j - 1, -1): + out.append(('', tar[j1]['segment'])) + for i1 in range(src_len - 1, i - 1, -1): + out.append((src[i1]['segment'], '')) + out.append(('‖', '‖')) + _retrieve(i, j, 0, out) + + def _first_element(x): + return x[0] + + return sorted(alignments, key=_first_element, reverse=True) + + def sim_score(self, src, tar): + """Return the ALINE alignment score of two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + ALINE alignment score + + Examples + -------- + >>> cmp = ALINE() + >>> cmp.sim_score('cat', 'hat') + 50.0 + >>> cmp.sim_score('niall', 'neil') + 90.0 + >>> cmp.sim_score('aluminum', 'catalan') + 81.5 + >>> cmp.sim_score('atcg', 'tagc') + 65.0 + + + .. versionadded:: 0.4.0 + + """ + if src == '' and tar == '': + return 1.0 + return self.alignment(src, tar, score_only=True) + + def sim(self, src, tar): + """Return the normalized ALINE similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Normalized ALINE similarity + + Examples + -------- + >>> cmp = ALINE() + >>> cmp.dist('cat', 'hat') + 0.4117647058823529 + >>> cmp.dist('niall', 'neil') + 0.33333333333333337 + >>> cmp.dist('aluminum', 'catalan') + 0.5925 + >>> cmp.dist('atcg', 'tagc') + 0.45833333333333337 + + + .. versionadded:: 0.4.0 + + """ + num = self.sim_score(src, tar) + if num: + return num / self._normalizer( + [self.sim_score(src, src), self.sim_score(tar, tar)] + ) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/distance/_ample.py b/abydos/distance/_ample.py new file mode 100644 index 000000000..933b3df80 --- /dev/null +++ b/abydos/distance/_ample.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._ample. + +AMPLE similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['AMPLE'] + + +class AMPLE(_TokenDistance): + r"""AMPLE similarity. + + The AMPLE similarity :cite:`Dallmeier:2005,Abreu:2007` is defined in + getAverageSequenceWeight() in the AverageSequenceWeightEvaluator.java file + of AMPLE's source code. For two sets X and Y and a population N, it is + + .. math:: + + sim_{AMPLE}(X, Y) = + \big|\frac{|X \cap Y|}{|X|} - + \frac{|Y \setminus X|}{|N \setminus X|}\big| + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{AMPLE} = + \big|\frac{a}{a+b}-\frac{c}{c+d}\big| + + Notes + ----- + This measure is asymmetric. The first ratio considers how similar the two + strings are, while the second considers how dissimilar the second string + is. As a result, both very similar and very dissimilar strings will score + high on this measure, provided the unique aspects are present chiefly + in the latter string. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize AMPLE instance. 
+ + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(AMPLE, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the AMPLE similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + AMPLE similarity + + Examples + -------- + >>> cmp = AMPLE() + >>> cmp.sim('cat', 'hat') + 0.49743589743589745 + >>> cmp.sim('Niall', 'Neil') + 0.32947729220222793 + >>> cmp.sim('aluminum', 'Catalan') + 0.10209049255441008 + >>> cmp.sim('ATCG', 'TAGC') + 0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + # If the denominators are 0, set them to 1. + # This is a deviation from the formula, but prevents division by zero + # while retaining the contribution of the other ratio. + if a + b == 0: + b = 1 + if c + d == 0: + d = 1 + + return abs((a / (a + b)) - (c / (c + d))) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_anderberg.py b/abydos/distance/_anderberg.py new file mode 100644 index 000000000..ad6320e03 --- /dev/null +++ b/abydos/distance/_anderberg.py @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._anderberg. + +Anderberg's d +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Anderberg'] + + +class Anderberg(_TokenDistance): + r"""Anderberg's D. + + For two sets X and Y and a population N, Anderberg's D + :cite:`Anderberg:1973` is + + .. 
math::
+
+        t_1 = max(|X \cap Y|, |X \setminus Y|)+
+        max(|Y \setminus X|, |(N \setminus X) \setminus Y|)+\\
+        max(|X \cap Y|, |Y \setminus X|)+
+        max(|X \setminus Y|, |(N \setminus X) \setminus Y|)\\
+        \\
+        t_2 = max(|Y|, |N \setminus Y|)+max(|X|, |N \setminus X|)\\
+        \\
+        sim_{Anderberg}(X, Y) =
+        \frac{t_1-t_2}{2|N|}
+
+    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
+    this is
+
+    .. math::
+
+        sim_{Anderberg} =
+        \frac{(max(a,b)+max(c,d)+max(a,c)+max(b,d))-
+        (max(a+c,b+d)+max(a+b,c+d))}{2n}
+
+    Notes
+    -----
+    There are various references to another "Anderberg similarity",
+    :math:`sim_{Anderberg} = \frac{8a}{8a+b+c}`, but I cannot substantiate
+    the claim that this appears in :cite:`Anderberg:1973`. In any case,
+    if you want to use this measure, you may instantiate
+    :py:class:`WeightedJaccard` with `weight=8`.
+
+    Anderberg states that "[t]his quantity is the actual reduction in the
+    error probability (also the actual increase in the correct prediction) as
+    a consequence of using predictor information" :cite:`Anderberg:1973`. It
+    ranges over [0, 0.5], so a ``sim`` method ranging over [0, 1] is provided
+    in addition to ``sim_score``, which gives the value of D itself.
+
+    It is difficult to term this measure a similarity score. Identical
+    strings often fail to gain high scores. Also, strings that would
+    otherwise be considered quite similar often earn lower scores than those
+    that are less similar.
+
+
+    .. versionadded:: 0.4.0
+
+    """
+
+    def __init__(
+        self,
+        alphabet=None,
+        tokenizer=None,
+        intersection_type='crisp',
+        **kwargs
+    ):
+        """Initialize Anderberg instance.
+
+        Parameters
+        ----------
+        alphabet : Counter, collection, int, or None
+            This represents the alphabet of possible tokens.
+            See :ref:`alphabet ` description in
+            :py:class:`_TokenDistance` for details.
+        tokenizer : _Tokenizer
+            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
+        intersection_type : str
+            Specifies the intersection type, and set type as a result:
+            See :ref:`intersection_type ` description in
+            :py:class:`_TokenDistance` for details.
+        **kwargs
+            Arbitrary keyword arguments
+
+        Other Parameters
+        ----------------
+        qval : int
+            The length of each q-gram. Using this parameter and tokenizer=None
+            will cause the instance to use the QGram tokenizer with this
+            q value.
+        metric : _Distance
+            A string distance measure class for use in the ``soft`` and
+            ``fuzzy`` variants.
+        threshold : float
+            A threshold value, similarities above which are counted as
+            members of the intersection for the ``fuzzy`` variant.
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        super(Anderberg, self).__init__(
+            alphabet=alphabet,
+            tokenizer=tokenizer,
+            intersection_type=intersection_type,
+            **kwargs
+        )
+
+    def sim_score(self, src, tar):
+        """Return Anderberg's D similarity of two strings.
+
+        Parameters
+        ----------
+        src : str
+            Source string (or QGrams/Counter objects) for comparison
+        tar : str
+            Target string (or QGrams/Counter objects) for comparison
+
+        Returns
+        -------
+        float
+            Anderberg similarity
+
+        Examples
+        --------
+        >>> cmp = Anderberg()
+        >>> cmp.sim_score('cat', 'hat')
+        0.0
+        >>> cmp.sim_score('Niall', 'Neil')
+        0.0
+        >>> cmp.sim_score('aluminum', 'Catalan')
+        0.0
+        >>> cmp.sim_score('ATCG', 'TAGC')
+        0.0
+
+
+        ..
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (max(a, b) + max(c, d) + max(a, c) + max(b, d)) - ( + max(a + c, b + d) + max(a + b, c + d) + ) + + if num == 0.0: + return 0.0 + return num / (2 * (a + b + c + d)) + + def sim(self, src, tar): + """Return the normalized Anderberg's D similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Anderberg similarity + + Examples + -------- + >>> cmp = Anderberg() + >>> cmp.sim('cat', 'hat') + 0.0 + >>> cmp.sim('Niall', 'Neil') + 0.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return 2 * self.sim_score(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_andres_marzo_delta.py b/abydos/distance/_andres_marzo_delta.py new file mode 100644 index 000000000..07dd2aff1 --- /dev/null +++ b/abydos/distance/_andres_marzo_delta.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._andres_marzo_delta. + +Andres & Marzo's Delta correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['AndresMarzoDelta'] + + +class AndresMarzoDelta(_TokenDistance): + r"""Andres & Marzo's Delta correlation. + + For two sets X and Y and a population N, Andres & Marzo's :math:`\Delta` + correlation :cite:`Andres:2004` is + + .. math:: + + corr_{AndresMarzo_\Delta}(X, Y) = \Delta = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y| - + 2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{AndresMarzo_\Delta} = \Delta = + \frac{a+d-2\sqrt{b \cdot c}}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize AndresMarzoDelta instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(AndresMarzoDelta, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Andres & Marzo's Delta correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Andres & Marzo's Delta correlation + + Examples + -------- + >>> cmp = AndresMarzoDelta() + >>> cmp.corr('cat', 'hat') + 0.9897959183673469 + >>> cmp.corr('Niall', 'Neil') + 0.9822344346552608 + >>> cmp.corr('aluminum', 'Catalan') + 0.9618259496215341 + >>> cmp.corr('ATCG', 'TAGC') + 0.9744897959183674 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = a + d - 2 * (b * c) ** 0.5 + + if num == 0.0: + return 0.0 + return num / n + + def sim(self, src, tar): + """Return the Andres & Marzo's Delta similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Andres & Marzo's Delta similarity + + Examples + -------- + >>> cmp = AndresMarzoDelta() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.9911172173276304 + >>> cmp.sim('aluminum', 'Catalan') + 0.980912974810767 + >>> cmp.sim('ATCG', 'TAGC') + 0.9872448979591837 + + + .. versionadded:: 0.4.0 + + """ + return (self.corr(src, tar) + 1) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_average_linkage.py b/abydos/distance/_average_linkage.py new file mode 100644 index 000000000..7aaa41752 --- /dev/null +++ b/abydos/distance/_average_linkage.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._average_linkage. + +Average linkage distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._levenshtein import Levenshtein +from ._token_distance import _TokenDistance + +__all__ = ['AverageLinkage'] + + +class AverageLinkage(_TokenDistance): + r"""Average linkage distance. 
+ + For two lists of tokens X and Y, average linkage distance + :cite:`Deza:2016` is + + .. math:: + + dist_{AverageLinkage}(X, Y) = + \frac{\sum_{i \in X} \sum_{j \in Y} dist(X_i, Y_j)}{|X| \cdot |Y|} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, metric=None, **kwargs): + """Initialize AverageLinkage instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. (Defaults to Levenshtein distance) + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(AverageLinkage, self).__init__(tokenizer=tokenizer, **kwargs) + if metric is None: + self._metric = Levenshtein() + else: + self._metric = metric + + def dist(self, src, tar): + """Return the average linkage distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + average linkage distance + + Examples + -------- + >>> cmp = AverageLinkage() + >>> cmp.dist('cat', 'hat') + 0.8125 + >>> cmp.dist('Niall', 'Neil') + 0.8333333333333334 + >>> cmp.dist('aluminum', 'Catalan') + 0.9166666666666666 + >>> cmp.dist('ATCG', 'TAGC') + 0.8 + + + .. versionadded:: 0.4.0 + + """ + if not src and not tar: + return 0.0 + + src = self.params['tokenizer'].tokenize(src).get_list() + tar = self.params['tokenizer'].tokenize(tar).get_list() + + if not src or not tar: + return 1.0 + + num = 0.0 + den = len(src) * len(tar) + + for term_src in src: + for term_tar in tar: + num += self._metric.dist(term_src, term_tar) + + return num / den + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_azzoo.py b/abydos/distance/_azzoo.py new file mode 100644 index 000000000..5d80a855a --- /dev/null +++ b/abydos/distance/_azzoo.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._azzoo. + +AZZOO similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['AZZOO'] + + +class AZZOO(_TokenDistance): + r"""AZZOO similarity. + + For two sets X and Y, and alphabet N, and a parameter :math:`\sigma`, + AZZOO similarity :cite:`Cha:2006` is + + .. math:: + + sim_{AZZOO_{\sigma}}(X, Y) = + \sum{s_i} + + where :math:`s_i = 1` if :math:`X_i = Y_i = 1`, + :math:`s_i = \sigma` if :math:`X_i = Y_i = 0`, + and :math:`s_i = 0` otherwise. 
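+
+    For example, under the q-gram tokenization assumed by the usage examples
+    below (whose values imply an alphabet of :math:`28^2 = 784` possible
+    q-grams), comparing 'cat' and 'hat' gives 2 q-grams present in both
+    strings and 778 q-grams absent from both, so with the default
+    :math:`\sigma = 0.5` the score is :math:`2 + 0.5 \cdot 778 = 391`.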
+ + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{AZZOO} = a + \sigma \cdot d + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + sigma=0.5, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize AZZOO instance. + + Parameters + ---------- + sigma : float + Sigma designates the contribution to similarity given by the + 0-0 samples in the set. + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(AZZOO, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + self.set_params(sigma=sigma) + + def sim_score(self, src, tar): + """Return the AZZOO similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + AZZOO similarity + + Examples + -------- + >>> cmp = AZZOO() + >>> cmp.sim_score('cat', 'hat') + 391.0 + >>> cmp.sim_score('Niall', 'Neil') + 389.5 + >>> cmp.sim_score('aluminum', 'Catalan') + 385.5 + >>> cmp.sim_score('ATCG', 'TAGC') + 387.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + + return a + self.params['sigma'] * d + + def sim(self, src, tar): + """Return the AZZOO similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + AZZOO similarity + + Examples + -------- + >>> cmp = AZZOO() + >>> cmp.sim('cat', 'hat') + 0.9923857868020305 + >>> cmp.sim('Niall', 'Neil') + 0.9860759493670886 + >>> cmp.sim('aluminum', 'Catalan') + 0.9710327455919395 + >>> cmp.sim('ATCG', 'TAGC') + 0.9809885931558935 + + + .. versionadded:: 0.4.0 + + """ + den = max(self.sim_score(src, src), self.sim_score(tar, tar)) + if den == 0.0: + return 1.0 + + return self.sim_score(src, tar) / den + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_bag.py b/abydos/distance/_bag.py index dc1f98e20..be3eb17ff 100644 --- a/abydos/distance/_bag.py +++ b/abydos/distance/_bag.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. 
# # Abydos is free software: you can redistribute it and/or modify @@ -28,9 +28,11 @@ unicode_literals, ) -from collections import Counter +from deprecation import deprecated from ._token_distance import _TokenDistance +from .. import __version__ +from ..tokenizer import CharacterTokenizer __all__ = ['Bag', 'bag', 'dist_bag', 'sim_bag'] @@ -38,11 +40,54 @@ class Bag(_TokenDistance): """Bag distance. - Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as: - :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`. + Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as + + .. math:: + + dist_{bag}(src, tar) = + max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|) + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar): + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Bag instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + if tokenizer is None: + tokenizer = CharacterTokenizer() + super(Bag, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def dist_abs(self, src, tar, normalized=False): """Return the bag distance between two strings. Parameters @@ -51,10 +96,12 @@ def dist_abs(self, src, tar): Source string for comparison tar : str Target string for comparison + normalized : bool + Normalizes to [0, 1] if True Returns ------- - int + int or float Bag distance Examples @@ -73,6 +120,11 @@ def dist_abs(self, src, tar): >>> cmp.dist_abs('abcdefg', 'hijklmno') 8 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if tar == src: return 0 @@ -81,12 +133,14 @@ def dist_abs(self, src, tar): elif not tar: return len(src) - src_bag = Counter(src) - tar_bag = Counter(tar) - return max( - sum((src_bag - tar_bag).values()), - sum((tar_bag - src_bag).values()), - ) + self._tokenize(src, tar) + + dist = max(self._src_only_card(), self._tar_only_card()) + + if normalized: + dist /= max(self._src_card(), self._tar_card()) + + return dist def dist(self, src, tar): """Return the normalized bag distance between two strings. @@ -117,17 +171,26 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if tar == src: return 0.0 if not src or not tar: return 1.0 - max_length = max(len(src), len(tar)) - - return self.dist_abs(src, tar) / max_length + return self.dist_abs(src, tar, normalized=True) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Bag.dist_abs method instead.', +) def bag(src, tar): """Return the bag distance between two strings. @@ -160,10 +223,18 @@ def bag(src, tar): >>> bag('abcdefg', 'hijklmno') 8 + .. 
versionadded:: 0.1.0 + """ return Bag().dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Bag.dist method instead.', +) def dist_bag(src, tar): """Return the normalized bag distance between two strings. @@ -192,10 +263,18 @@ def dist_bag(src, tar): >>> dist_bag('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ return Bag().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Bag.sim method instead.', +) def sim_bag(src, tar): """Return the normalized bag similarity of two strings. @@ -224,6 +303,8 @@ def sim_bag(src, tar): >>> sim_bag('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ return Bag().sim(src, tar) diff --git a/abydos/distance/_baroni_urbani_buser_i.py b/abydos/distance/_baroni_urbani_buser_i.py new file mode 100644 index 000000000..b198b744a --- /dev/null +++ b/abydos/distance/_baroni_urbani_buser_i.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baroni_urbani_buser_i. + +Baroni-Urbani & Buser I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaroniUrbaniBuserI'] + + +class BaroniUrbaniBuserI(_TokenDistance): + r"""Baroni-Urbani & Buser I similarity. + + For two sets X and Y and a population N, the Baroni-Urbani & Buser I + similarity :cite:`BaroniUrbani:1976` is + + .. math:: + + sim_{BaroniUrbaniBuserI}(X, Y) = + \frac{\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + |X \cap Y|} + {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + |X \cap Y| + |X \setminus Y| + |Y \setminus X|} + + This is the second, but more commonly used and referenced of the two + similarities proposed by Baroni-Urbani & Buser. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{BaroniUrbaniBuserI} = + \frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaroniUrbaniBuserI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaroniUrbaniBuserI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Baroni-Urbani & Buser I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baroni-Urbani & Buser I similarity + + Examples + -------- + >>> cmp = BaroniUrbaniBuserI() + >>> cmp.sim('cat', 'hat') + 0.9119837740878104 + >>> cmp.sim('Niall', 'Neil') + 0.8552823175014205 + >>> cmp.sim('aluminum', 'Catalan') + 0.656992712054851 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + return ((a * d) ** 0.5 + a) / ((a * d) ** 0.5 + a + b + c) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baroni_urbani_buser_ii.py b/abydos/distance/_baroni_urbani_buser_ii.py new file mode 100644 index 000000000..4427d294c --- /dev/null +++ b/abydos/distance/_baroni_urbani_buser_ii.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baroni_urbani_buser_ii. + +Baroni-Urbani & Buser II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaroniUrbaniBuserII'] + + +class BaroniUrbaniBuserII(_TokenDistance): + r"""Baroni-Urbani & Buser II correlation. + + For two sets X and Y and a population N, the Baroni-Urbani & Buser II + correlation :cite:`BaroniUrbani:1976` is + + .. math:: + + corr_{BaroniUrbaniBuserII}(X, Y) = + \frac{\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + |X \cap Y| - |X \setminus Y| - |Y \setminus X|} + {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + |X \cap Y| + |X \setminus Y| + |Y \setminus X|} + + This is the first, but less commonly used and referenced of the two + similarities proposed by Baroni-Urbani & Buser. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{BaroniUrbaniBuserII} = + \frac{\sqrt{ad}+a-b-c}{\sqrt{ad}+a+b+c} + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaroniUrbaniBuserII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaroniUrbaniBuserII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Baroni-Urbani & Buser II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baroni-Urbani & Buser II correlation + + Examples + -------- + >>> cmp = BaroniUrbaniBuserII() + >>> cmp.corr('cat', 'hat') + 0.8239675481756209 + >>> cmp.corr('Niall', 'Neil') + 0.7105646350028408 + >>> cmp.corr('aluminum', 'Catalan') + 0.31398542410970204 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + return ((a * d) ** 0.5 + a - b - c) / ((a * d) ** 0.5 + a + b + c) + + def sim(self, src, tar): + """Return the Baroni-Urbani & Buser II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baroni-Urbani & Buser II similarity + + Examples + -------- + >>> cmp = BaroniUrbaniBuserII() + >>> cmp.sim('cat', 'hat') + 0.9119837740878105 + >>> cmp.sim('Niall', 'Neil') + 0.8552823175014204 + >>> cmp.sim('aluminum', 'Catalan') + 0.656992712054851 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (self.corr(src, tar) + 1) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_batagelj_bren.py b/abydos/distance/_batagelj_bren.py new file mode 100644 index 000000000..d5858f8b7 --- /dev/null +++ b/abydos/distance/_batagelj_bren.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._batagelj_bren. + +Batagelj & Bren distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BatageljBren'] + + +class BatageljBren(_TokenDistance): + r"""Batagelj & Bren distance. + + For two sets X and Y and a population N, the Batagelj & Bren + distance :cite:`Batagelj:1995`, Batagelj & Bren's :math:`Q_0`, is + + .. math:: + + dist_{BatageljBren}(X, Y) = + \frac{|X \setminus Y| \cdot |Y \setminus X|} + {|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BatageljBren} = + \frac{bc}{ad} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BatageljBren instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BatageljBren, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar): + """Return the Batagelj & Bren distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Batagelj & Bren distance + + Examples + -------- + >>> cmp = BatageljBren() + >>> cmp.dist_abs('cat', 'hat') + 0.002570694087403599 + >>> cmp.dist_abs('Niall', 'Neil') + 0.007741935483870968 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.07282184655396619 + >>> cmp.dist_abs('ATCG', 'TAGC') + inf + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if a == 0 or d == 0: + return float('inf') + return b * c / (a * d) + + def dist(self, src, tar): + """Return the normalized Batagelj & Bren distance of two strings. 
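+
+        The :math:`Q_0` value computed by :meth:`dist_abs` is divided by the
+        population size :math:`n = a + b + c + d`, and pairs for which
+        :math:`a \cdot d = 0` (an infinite :math:`Q_0`) are assigned a
+        distance of 1.0.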
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Batagelj & Bren distance + + Examples + -------- + >>> cmp = BatageljBren() + >>> cmp.dist('cat', 'hat') + 3.2789465400556106e-06 + >>> cmp.dist('Niall', 'Neil') + 9.874917709019092e-06 + >>> cmp.dist('aluminum', 'Catalan') + 9.276668350823718e-05 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if a == 0 or d == 0: + return 1.0 + return (b * c / (a * d)) / (a + b + c + d) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_i.py b/abydos/distance/_baulieu_i.py new file mode 100644 index 000000000..e83b584cc --- /dev/null +++ b/abydos/distance/_baulieu_i.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_i. + +Baulieu I distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuI'] + + +class BaulieuI(_TokenDistance): + r"""Baulieu I distance. + + For two sets X and Y and a population N, Baulieu I distance + :cite:`Baulieu:1989` is + + .. math:: + + sim_{BaulieuI}(X, Y) = + \frac{|X| \cdot |Y| - |X \cap Y|^2}{|X| \cdot |Y|} + + This is Baulieu's 12th dissimilarity coefficient. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{BaulieuI} = + \frac{(a+b)(a+c)-a^2}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. 
+ threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu I distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu I distance + + Examples + -------- + >>> cmp = BaulieuI() + >>> cmp.dist('cat', 'hat') + 0.75 + >>> cmp.dist('Niall', 'Neil') + 0.8666666666666667 + >>> cmp.dist('aluminum', 'Catalan') + 0.9861111111111112 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + ab = self._src_card() + ac = self._tar_card() + + num = ab * ac - a * a + + if num == 0: + return 0.0 + return num / (ab * ac) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_ii.py b/abydos/distance/_baulieu_ii.py new file mode 100644 index 000000000..9df08902f --- /dev/null +++ b/abydos/distance/_baulieu_ii.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_ii. + +Baulieu II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuII'] + + +class BaulieuII(_TokenDistance): + r"""Baulieu II similarity. + + For two sets X and Y and a population N, Baulieu II similarity + :cite:`Baulieu:1989` is + + .. math:: + + sim_{BaulieuII}(X, Y) = + \frac{|X \cap Y|^2 \cdot |(N \setminus X) \setminus Y|^2} + {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|} + + This is based on Baulieu's 13th dissimilarity coefficient. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{BaulieuII} = + \frac{a^2d^2}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Baulieu II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu II similarity + + Examples + -------- + >>> cmp = BaulieuII() + >>> cmp.sim('cat', 'hat') + 0.24871959237343852 + >>> cmp.sim('Niall', 'Neil') + 0.13213719608444902 + >>> cmp.sim('aluminum', 'Catalan') + 0.013621892326789235 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * a * d * d + + if num == 0: + return 0.0 + return num / ((a + b) * (a + c) * (b + d) * (c + d)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_iii.py b/abydos/distance/_baulieu_iii.py new file mode 100644 index 000000000..952dacf03 --- /dev/null +++ b/abydos/distance/_baulieu_iii.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_iii. + +Baulieu III distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuIII'] + + +class BaulieuIII(_TokenDistance): + r"""Baulieu III distance. + + For two sets X and Y and a population N, Baulieu III distance + :cite:`Baulieu:1989` is + + .. math:: + + sim_{BaulieuIII}(X, Y) = + \frac{|N|^2 - 4(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)}{2 \cdot |N|^2} + + This is based on Baulieu's 20th dissimilarity coefficient. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{BaulieuIII} = + \frac{n^2 - 4(ad-bc)}{2n^2} + + Notes + ----- + It should be noted that this is *based on* Baulieu's 20th dissimilarity + coefficient. This distance is exactly half Baulieu's 20th dissimilarity. 
+ According to :cite:`Baulieu:1989`, the 20th dissimilarity should be a + value in the range [0.0, 1.0], meeting the article's (P1) property, but the + formula given ranges [0.0, 2.0], so dividing by 2 corrects the formula to + meet the article's expectations. + + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu III distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu III distance + + Examples + -------- + >>> cmp = BaulieuIII() + >>> cmp.dist('cat', 'hat') + 0.4949500208246564 + >>> cmp.dist('Niall', 'Neil') + 0.4949955747605165 + >>> cmp.dist('aluminum', 'Catalan') + 0.49768591017891195 + >>> cmp.dist('ATCG', 'TAGC') + 0.5000813463140358 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = n * n - 4 * (a * d - b * c) + + if num == 0: + return 0.0 + return num / (2 * n * n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_iv.py b/abydos/distance/_baulieu_iv.py new file mode 100644 index 000000000..4fb5d86b9 --- /dev/null +++ b/abydos/distance/_baulieu_iv.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_iv. 
+ +Baulieu IV distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import e + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuIV'] + + +class BaulieuIV(_TokenDistance): + r"""Baulieu IV distance. + + For two sets X and Y, a population N, and a positive irractional number k, + Baulieu IV distance :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuIV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| - + (|X \cap Y| + \frac{1}{2}) \cdot (|(N \setminus X) \setminus Y| + + \frac{1}{2}) \cdot |(N \setminus X) \setminus Y| \cdot k}{|N|} + + This is Baulieu's 22nd dissimilarity coefficient. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuIV} = \frac{b+c-(a+\frac{1}{2})(d+\frac{1}{2})dk}{n} + + Notes + ----- + The default value of k is Euler's number :math:`e`, but other irrationals + such as :math:`\pi` or :math:`\sqrt{2}` could be substituted at + initialization. + + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + positive_irrational=e, + **kwargs + ): + """Initialize BaulieuIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + self._positive_irrational = positive_irrational + + def dist_abs(self, src, tar): + """Return the Baulieu IV distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu IV distance + + Examples + -------- + >>> cmp = BaulieuIV() + >>> cmp.dist_abs('cat', 'hat') + -5249.96272285802 + >>> cmp.dist_abs('Niall', 'Neil') + -5209.561726488335 + >>> cmp.dist_abs('aluminum', 'Catalan') + -3073.6070822721244 + >>> cmp.dist_abs('ATCG', 'TAGC') + -1039.2151656463932 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + k = self._positive_irrational + + num = (b + c) - (a + 0.5) * (d + 0.5) * d * k + + if num == 0.0: + return 0.0 + return num / n + + def dist(self, src, tar): + """Return the normalized Baulieu IV distance of two strings. 
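+
+        The absolute distance returned by :meth:`dist_abs` is rescaled as
+        :math:`\frac{dist_{abs} + n^3 k}{2 n^3 k}`, where :math:`n` is the
+        population size and :math:`k` is the chosen positive irrational.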
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Baulieu IV distance + + Examples + -------- + >>> cmp = BaulieuIV() + >>> cmp.dist('cat', 'hat') + 0.49999799606535283 + >>> cmp.dist('Niall', 'Neil') + 0.49999801148659684 + >>> cmp.dist('aluminum', 'Catalan') + 0.49999883126809364 + >>> cmp.dist('ATCG', 'TAGC') + 0.4999996033268451 + + + .. versionadded:: 0.4.0 + + """ + distance = self.dist_abs(src, tar) + n3 = self._population_unique_card() ** 3 + k = self._positive_irrational + + num = distance + n3 * k + + if num == 0.0: + return 0.0 + return (distance + n3 * k) / (2 * n3 * k) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_ix.py b/abydos/distance/_baulieu_ix.py new file mode 100644 index 000000000..91979da5d --- /dev/null +++ b/abydos/distance/_baulieu_ix.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_ix. + +Baulieu IX distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuIX'] + + +class BaulieuIX(_TokenDistance): + r"""Baulieu IX distance. + + For two sets X and Y and a population N, Baulieu IX distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuIX}(X, Y) = \frac{|X \setminus Y| + 2 \cdot + |Y \setminus X|}{|N| + |Y \setminus X|} + + This is Baulieu's 27th dissimilarity coefficient. This coefficient fails + Baulieu's (P7) property, that :math:`D(a,b,c,d) = D(a,c,b,d)`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuIX} = \frac{b+2c}{a+b+2c+d} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuIX instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. 
+ threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuIX, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu IX distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu IX distance + + Examples + -------- + >>> cmp = BaulieuIX() + >>> cmp.dist('cat', 'hat') + 0.007633587786259542 + >>> cmp.dist('Niall', 'Neil') + 0.012706480304955527 + >>> cmp.dist('aluminum', 'Catalan') + 0.027777777777777776 + >>> cmp.dist('ATCG', 'TAGC') + 0.019011406844106463 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + return (b + 2 * c) / (c + n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_v.py b/abydos/distance/_baulieu_v.py new file mode 100644 index 000000000..a217c3c09 --- /dev/null +++ b/abydos/distance/_baulieu_v.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_v. + +Baulieu V distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuV'] + + +class BaulieuV(_TokenDistance): + r"""Baulieu V distance. + + For two sets X and Y and a population N, Baulieu V distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| + + 1}{|X \cap Y| + |X \setminus Y| + |Y \setminus X| + 1} + + This is Baulieu's 23rd dissimilarity coefficient. This coefficient fails + Baulieu's (P2) property, that :math:`D(a,0,0,0) = 0`. Rather, + :math:`D(a,0,0,0) > 0`, but + :math:`\lim_{a \to \infty} D(a,0,0,0) = 0`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuV} = \frac{b+c+1}{a+b+c+1} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu V distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu V distance + + Examples + -------- + >>> cmp = BaulieuV() + >>> cmp.dist('cat', 'hat') + 0.7142857142857143 + >>> cmp.dist('Niall', 'Neil') + 0.8 + >>> cmp.dist('aluminum', 'Catalan') + 0.9411764705882353 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (b + c + 1) / (a + b + c + 1) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_vi.py b/abydos/distance/_baulieu_vi.py new file mode 100644 index 000000000..7e0522514 --- /dev/null +++ b/abydos/distance/_baulieu_vi.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_vi. + +Baulieu VI distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuVI'] + + +class BaulieuVI(_TokenDistance): + r"""Baulieu VI distance. + + For two sets X and Y and a population N, Baulieu VI distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuVI}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|} + {|X \cap Y| + |X \setminus Y| + |Y \setminus X| + 1} + + This is Baulieu's 24th dissimilarity coefficient. This coefficient fails + Baulieu's (P3) property, that :math:`D(a,b,c,d) = 1` for some (a,b,c,d). + Rather, :math:`D(a,b,c,d) < 1`, but + :math:`\lim_{b \to \infty, c \to \infty} D(a,b,c,d) = 0` for :math:`a = 0`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuVI} = \frac{b+c}{a+b+c+1} + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuVI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuVI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu VI distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu VI distance + + Examples + -------- + >>> cmp = BaulieuVI() + >>> cmp.dist('cat', 'hat') + 0.5714285714285714 + >>> cmp.dist('Niall', 'Neil') + 0.7 + >>> cmp.dist('aluminum', 'Catalan') + 0.8823529411764706 + >>> cmp.dist('ATCG', 'TAGC') + 0.9090909090909091 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (b + c) / (a + b + c + 1) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_vii.py b/abydos/distance/_baulieu_vii.py new file mode 100644 index 000000000..595957c95 --- /dev/null +++ b/abydos/distance/_baulieu_vii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_vii. + +Baulieu VII distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuVII'] + + +class BaulieuVII(_TokenDistance): + r"""Baulieu VII distance. + + For two sets X and Y and a population N, Baulieu VII distance + :cite:`Baulieu:1997` is + + .. 
math:: + + dist_{BaulieuVII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|} + {|N| + |X \cap Y| \cdot (|X \cap Y| - 4)^2} + + This is Baulieu's 25th dissimilarity coefficient. This coefficient fails + Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0` + with equality holding iff :math:`D(a,b,c,d) = 0`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuVII} = \frac{b+c}{n + a \cdot (a-4)^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuVII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuVII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu VII distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu VII distance + + Examples + -------- + >>> cmp = BaulieuVII() + >>> cmp.dist('cat', 'hat') + 0.005050505050505051 + >>> cmp.dist('Niall', 'Neil') + 0.008838383838383838 + >>> cmp.dist('aluminum', 'Catalan') + 0.018891687657430732 + >>> cmp.dist('ATCG', 'TAGC') + 0.012755102040816327 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + bpc = self._src_only_card() + self._tar_only_card() + n = self._population_unique_card() + + return bpc / (n + a * (a - 4) ** 2) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_viii.py b/abydos/distance/_baulieu_viii.py new file mode 100644 index 000000000..928e5553c --- /dev/null +++ b/abydos/distance/_baulieu_viii.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_viii. + +Baulieu VIII distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuVIII'] + + +class BaulieuVIII(_TokenDistance): + r"""Baulieu VIII distance. + + For two sets X and Y and a population N, Baulieu VIII distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuVIII}(X, Y) = \frac{(|X \setminus Y| - + |Y \setminus X|)^2}{|N|^2} + + This is Baulieu's 26th dissimilarity coefficient. This coefficient fails + Baulieu's (P5) property, that :math:`D(a,b+1,c,d) \geq D(a,b,c,d)`, + with equality holding if :math:`D(a,b,c,d) = 1`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuVIII} = \frac{(b-c)^2}{n^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuVIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuVIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu VIII distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu VIII distance + + Examples + -------- + >>> cmp = BaulieuVIII() + >>> cmp.dist('cat', 'hat') + 0.0 + >>> cmp.dist('Niall', 'Neil') + 1.6269262807163682e-06 + >>> cmp.dist('aluminum', 'Catalan') + 1.6227838857560144e-06 + >>> cmp.dist('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + bmc = self._src_only_card() - self._tar_only_card() + n = self._population_unique_card() + + if bmc == 0.0: + return 0.0 + return bmc ** 2 / n ** 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_x.py b/abydos/distance/_baulieu_x.py new file mode 100644 index 000000000..218a1a68c --- /dev/null +++ b/abydos/distance/_baulieu_x.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_x. + +Baulieu X distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuX'] + + +class BaulieuX(_TokenDistance): + r"""Baulieu X distance. + + For two sets X and Y and a population N, Baulieu X distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuX}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| + + max(|X \setminus Y|, |Y \setminus X|)}{|N| + + max(|X \setminus Y|, |Y \setminus X|)} + + This is Baulieu's 28th dissimilarity coefficient. This coefficient fails + Baulieu's (P8) property, that :math:`D` is a rational function whose + numerator and denominator are both (total) linear. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuX} = \frac{b+c+max(b,c)}{n+max(b,c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuX instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuX, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu X distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu X distance + + Examples + -------- + >>> cmp = BaulieuX() + >>> cmp.dist('cat', 'hat') + 0.007633587786259542 + >>> cmp.dist('Niall', 'Neil') + 0.013959390862944163 + >>> cmp.dist('aluminum', 'Catalan') + 0.029003783102143757 + >>> cmp.dist('ATCG', 'TAGC') + 0.019011406844106463 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + num = b + c + max(b, c) + + if num == 0.0: + return 0.0 + return num / (n + max(b, c)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_xi.py b/abydos/distance/_baulieu_xi.py new file mode 100644 index 000000000..09558f947 --- /dev/null +++ b/abydos/distance/_baulieu_xi.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_xi. + +Baulieu XI distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuXI'] + + +class BaulieuXI(_TokenDistance): + r"""Baulieu XI distance. + + For two sets X and Y and a population N, Baulieu XI distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuXI}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|} + {|X \setminus Y| + |Y \setminus X| + |(N \setminus X) \setminus Y|} + + This is Baulieu's 29th dissimilarity coefficient. This coefficient fails + Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0` + with equality holding iff :math:`D(a,b,c,d) = 0`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuXI} = \frac{b+c}{b+c+d} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuXI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuXI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu XI distance of two strings. 
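+
+ As a rough hand check of the first doctest below (a sketch assuming
+ the default q-gram tokenizer and the 784-term population that the
+ other doctest values imply), 'cat' and 'hat' share 2 bigrams and each
+ contributes 2 unique bigrams, so :math:`b+c=4`, :math:`d=784-6=778`,
+ and :math:`\frac{b+c}{b+c+d}=\frac{4}{782} \approx 0.0051`.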
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu XI distance + + Examples + -------- + >>> cmp = BaulieuXI() + >>> cmp.dist('cat', 'hat') + 0.005115089514066497 + >>> cmp.dist('Niall', 'Neil') + 0.008951406649616368 + >>> cmp.dist('aluminum', 'Catalan') + 0.01913265306122449 + >>> cmp.dist('ATCG', 'TAGC') + 0.012755102040816327 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + bpc = self._src_only_card() + self._tar_only_card() + d = self._total_complement_card() + + if bpc: + return bpc / (bpc + d) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_xii.py b/abydos/distance/_baulieu_xii.py new file mode 100644 index 000000000..5a6d13af3 --- /dev/null +++ b/abydos/distance/_baulieu_xii.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_xii. + +Baulieu XII distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuXII'] + + +class BaulieuXII(_TokenDistance): + r"""Baulieu XII distance. + + For two sets X and Y and a population N, Baulieu XII distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuXII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|} + {|X \cap Y| + |X \setminus Y| + |Y \setminus X| - 1} + + This is Baulieu's 30th dissimilarity coefficient. This coefficient fails + Baulieu's (P5) property, that :math:`D(a,b+1,c,d) \geq D(a,b,c,d)`, + with equality holding if :math:`D(a,b,c,d) = 1`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuXII} = \frac{b+c}{a+b+c-1} + + Notes + ----- + In the special case of comparisons where the intersection (a) contains 0 + members, the size of the intersection is set to 1, resulting in a distance + of 1.0. This prevents the distance from exceeding 1.0 and similarity from + becoming negative. + + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuXII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuXII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu XII distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu XII distance + + Examples + -------- + >>> cmp = BaulieuXII() + >>> cmp.dist('cat', 'hat') + 0.8 + >>> cmp.dist('Niall', 'Neil') + 0.875 + >>> cmp.dist('aluminum', 'Catalan') + 1.0 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = max(1.0, self._intersection_card()) + bpc = self._src_only_card() + self._tar_only_card() + + if bpc == 0.0: + return 0.0 + return bpc / (a + bpc - 1) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_xiii.py b/abydos/distance/_baulieu_xiii.py new file mode 100644 index 000000000..c5ab5ea56 --- /dev/null +++ b/abydos/distance/_baulieu_xiii.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_xiii. + +Baulieu XIII distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuXIII'] + + +class BaulieuXIII(_TokenDistance): + r"""Baulieu XIII distance. + + For two sets X and Y and a population N, Baulieu XIII distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuXIII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|} + {|X \cap Y| + |X \setminus Y| + |Y \setminus X| + |X \cap Y| \cdot + (|X \cap Y| - 4)^2} + + This is Baulieu's 31st dissimilarity coefficient. This coefficient fails + Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0` + with equality holding iff :math:`D(a,b,c,d) = 0`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuXIII} = \frac{b+c}{a+b+c+a \cdot (a-4)^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuXIII instance. 
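+
+ A hypothetical usage sketch (not taken from the test suite): passing
+ ``qval=1`` here would presumably make the measure operate on single
+ characters rather than the default q-grams, since ``qval`` is
+ forwarded to the QGram tokenizer as described under Other Parameters
+ below.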
+ + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuXIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu XIII distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu XIII distance + + Examples + -------- + >>> cmp = BaulieuXIII() + >>> cmp.dist('cat', 'hat') + 0.2857142857142857 + >>> cmp.dist('Niall', 'Neil') + 0.4117647058823529 + >>> cmp.dist('aluminum', 'Catalan') + 0.6 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + bpc = self._src_only_card() + self._tar_only_card() + + if bpc == 0.0: + return 0.0 + return bpc / (a + bpc + a * (a - 4) ** 2) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_xiv.py b/abydos/distance/_baulieu_xiv.py new file mode 100644 index 000000000..9d19d47f4 --- /dev/null +++ b/abydos/distance/_baulieu_xiv.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_xiv. + +Baulieu XIV distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuXIV'] + + +class BaulieuXIV(_TokenDistance): + r"""Baulieu XIV distance. + + For two sets X and Y and a population N, Baulieu XIV distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuXIV}(X, Y) = \frac{|X \setminus Y| + 2 \cdot + |Y \setminus X|}{|X \cap Y| + |X \setminus Y| + 2 \cdot + |Y \setminus X|} + + This is Baulieu's 32nd dissimilarity coefficient. 
This coefficient fails + Baulieu's (P7) property, that :math:`D(a,b,c,d) = D(a,c,b,d)`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuXIV} = \frac{b+2c}{a+b+2c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuXIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuXIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu XIV distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu XIV distance + + Examples + -------- + >>> cmp = BaulieuXIV() + >>> cmp.dist('cat', 'hat') + 0.75 + >>> cmp.dist('Niall', 'Neil') + 0.8333333333333334 + >>> cmp.dist('aluminum', 'Catalan') + 0.9565217391304348 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (b + 2 * c) / (a + b + 2 * c) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baulieu_xv.py b/abydos/distance/_baulieu_xv.py new file mode 100644 index 000000000..3e5e31072 --- /dev/null +++ b/abydos/distance/_baulieu_xv.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._baulieu_xv. + +Baulieu XV distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BaulieuXV'] + + +class BaulieuXV(_TokenDistance): + r"""Baulieu XV distance. 
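+
+ This measure amounts to the Baulieu X formula with the population size
+ :math:`|N|` in its denominator replaced by the sample total
+ :math:`|X \cap Y| + |X \setminus Y| + |Y \setminus X|`, as the
+ definition below shows.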
+ + For two sets X and Y and a population N, Baulieu XV distance + :cite:`Baulieu:1997` is + + .. math:: + + dist_{BaulieuXV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| + + max(|X \setminus Y|, |Y \setminus X|)}{|X \cap Y| + |X \setminus Y| + + |Y \setminus X| + max(|X \setminus Y|, |Y \setminus X|)} + + This is Baulieu's 33rd dissimilarity coefficient. This coefficient fails + Baulieu's (P8) property, that :math:`D` is a rational function whose + numerator and denominator are both (total) linear. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{BaulieuXV} = \frac{b+c+max(b, c)}{a+b+c+max(b, c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BaulieuXV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BaulieuXV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Baulieu XV distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Baulieu XV distance + + Examples + -------- + >>> cmp = BaulieuXV() + >>> cmp.dist('cat', 'hat') + 0.75 + >>> cmp.dist('Niall', 'Neil') + 0.8461538461538461 + >>> cmp.dist('aluminum', 'Catalan') + 0.9583333333333334 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (b + c + max(b, c)) / (a + b + c + max(b, c)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_baystat.py b/abydos/distance/_baystat.py index 2806339aa..7a307f2fc 100644 --- a/abydos/distance/_baystat.py +++ b/abydos/distance/_baystat.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ __all__ = ['Baystat', 'dist_baystat', 'sim_baystat'] @@ -44,9 +47,36 @@ class Baystat(_Distance): This is ostensibly a port of the R module PPRL's implementation: https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp :cite:`Rukasz:2018`. As such, this could be made more pythonic. + + .. 
versionadded:: 0.3.6 """ - def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): + def __init__( + self, min_ss_len=None, left_ext=None, right_ext=None, **kwargs + ): + """Initialize Levenshtein instance. + + Parameters + ---------- + min_ss_len : int + Minimum substring length to be considered + left_ext : int + Left-side extension length + right_ext : int + Right-side extension length + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Baystat, self).__init__(**kwargs) + self._min_ss_len = min_ss_len + self._left_ext = left_ext + self._right_ext = right_ext + + def sim(self, src, tar): """Return the Baystat similarity. Parameters @@ -55,12 +85,6 @@ def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): Source string for comparison tar : str Target string for comparison - min_ss_len : int - Minimum substring length to be considered - left_ext :int - Left-side extension length - right_ext :int - Right-side extension length Returns ------- @@ -79,6 +103,11 @@ def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 @@ -87,7 +116,7 @@ def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): max_len = max(len(src), len(tar)) - if not (min_ss_len and left_ext and right_ext): + if not (self._min_ss_len and self._left_ext and self._right_ext): # These can be set via arguments to the function. Otherwise they # are set automatically based on values from the article. if max_len >= 7: @@ -101,6 +130,10 @@ def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): min_ss_len = 1 left_ext = 0 right_ext = 0 + else: + min_ss_len = self._min_ss_len + left_ext = self._left_ext + right_ext = self._right_ext pos = 0 match_len = 0 @@ -173,6 +206,12 @@ def sim(self, src, tar, min_ss_len=None, left_ext=None, right_ext=None): pos += ix +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Baystat.sim method instead.', +) def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): """Return the Baystat similarity. @@ -207,10 +246,18 @@ def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): >>> sim_baystat('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.3.0 + """ - return Baystat().sim(src, tar, min_ss_len, left_ext, right_ext) + return Baystat(min_ss_len, left_ext, right_ext).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Baystat.dist method instead.', +) def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): """Return the Baystat distance. @@ -245,8 +292,10 @@ def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): >>> dist_baystat('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.3.0 + """ - return Baystat().dist(src, tar, min_ss_len, left_ext, right_ext) + return Baystat(min_ss_len, left_ext, right_ext).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_benini_i.py b/abydos/distance/_benini_i.py new file mode 100644 index 000000000..9576643b4 --- /dev/null +++ b/abydos/distance/_benini_i.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._benini_i. + +Benini I correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BeniniI'] + + +class BeniniI(_TokenDistance): + r"""BeniniI correlation. + + For two sets X and Y and a population N, Benini I correlation, Benini's + Index of Attraction, :cite:`Benini:1901` is + + .. math:: + + corr_{BeniniI}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BeniniI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BeniniI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Benini I correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Benini I correlation + + Examples + -------- + >>> cmp = BeniniI() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.3953727506426735 + >>> cmp.corr('aluminum', 'Catalan') + 0.11485180412371133 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + + if num == 0.0: + return 0.0 + return num / ((a + c) * (c + d)) + + def sim(self, src, tar): + """Return the Benini I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Benini I similarity + + Examples + -------- + >>> cmp = BeniniI() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6976863753213367 + >>> cmp.sim('aluminum', 'Catalan') + 0.5574259020618557 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_benini_ii.py b/abydos/distance/_benini_ii.py new file mode 100644 index 000000000..0adf4c472 --- /dev/null +++ b/abydos/distance/_benini_ii.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._benini_ii. + +Benini II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BeniniII'] + + +class BeniniII(_TokenDistance): + r"""BeniniII correlation. + + For two sets X and Y and a population N, Benini II correlation, Benini's + Index of Repulsion, :cite:`Benini:1901` is + + .. math:: + + corr_{BeniniII}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {min(|Y| \cdot |N \setminus X|, |X| \cdot |N \setminus Y|} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{BeniniII} = \frac{ad-bc}{min((a+c)(c+d), (a+b)(b+d))} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BeniniII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BeniniII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Benini II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Benini II correlation + + Examples + -------- + >>> cmp = BeniniII() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.3953727506426735 + >>> cmp.corr('aluminum', 'Catalan') + 0.11485180412371133 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + + if num == 0.0: + return 0.0 + + bc_min = min(b, c) + return num / ((a + bc_min) * (bc_min + d)) + + def sim(self, src, tar): + """Return the Benini II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Benini II similarity + + Examples + -------- + >>> cmp = BeniniII() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6976863753213367 + >>> cmp.sim('aluminum', 'Catalan') + 0.5574259020618557 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_bennet.py b/abydos/distance/_bennet.py new file mode 100644 index 000000000..af48b39f4 --- /dev/null +++ b/abydos/distance/_bennet.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._bennet. + +Bennet's S correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Bennet'] + + +class Bennet(_TokenDistance): + r"""Bennet's S correlation. + + For two sets X and Y and a population N, Bennet's :math:`S` + correlation :cite:`Bennet:1954` is + + .. math:: + + corr_{Bennet}(X, Y) = S = + \frac{p_o - p_e^S}{1 - p_e^S} + + where + + .. 
math:: + + p_o = \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} + + p_e^S = \frac{1}{2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + p_o = \frac{a+d}{n} + + p_e^S = \frac{1}{2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Bennet instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Bennet, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Bennet's S correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Bennet's S correlation + + Examples + -------- + >>> cmp = Bennet() + >>> cmp.corr('cat', 'hat') + 0.989795918367347 + >>> cmp.corr('Niall', 'Neil') + 0.9821428571428572 + >>> cmp.corr('aluminum', 'Catalan') + 0.9617834394904459 + >>> cmp.corr('ATCG', 'TAGC') + 0.9744897959183674 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return 2 * (a + d) / n - 1 + + def sim(self, src, tar): + """Return the Bennet's S similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Bennet's S similarity + + Examples + -------- + >>> cmp = Bennet() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.9910714285714286 + >>> cmp.sim('aluminum', 'Catalan') + 0.9808917197452229 + >>> cmp.sim('ATCG', 'TAGC') + 0.9872448979591837 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_bhattacharyya.py b/abydos/distance/_bhattacharyya.py new file mode 100644 index 000000000..64a672079 --- /dev/null +++ b/abydos/distance/_bhattacharyya.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._bhattacharyya. + +Bhattacharyya distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['Bhattacharyya'] + + +class Bhattacharyya(_TokenDistance): + r"""Bhattacharyya distance. + + For two multisets X and Y drawn from an alphabet S, Bhattacharyya distance + :cite:`Bhattacharyya:1946` is + + .. math:: + + dist_{Bhattacharyya}(X, Y) = + -log(\sum_{i \in S} \sqrt{X_iY_i}) + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Bhattacharyya instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Bhattacharyya, self).__init__(tokenizer=tokenizer, **kwargs) + + def dist_abs(self, src, tar): + """Return the Bhattacharyya distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Bhattacharyya distance + + Examples + -------- + >>> cmp = Bhattacharyya() + >>> cmp.dist_abs('cat', 'hat') + 0.6931471805599453 + >>> cmp.dist_abs('Niall', 'Neil') + 1.0074515102711326 + >>> cmp.dist_abs('aluminum', 'Catalan') + 2.1383330595080277 + >>> cmp.dist_abs('ATCG', 'TAGC') + -inf + + + .. versionadded:: 0.4.0 + + """ + bc = self.dist(src, tar) + if bc == 0: + return float('-inf') + elif bc == 1: + return 0.0 + else: + return -log(bc) + + def dist(self, src, tar): + """Return the Bhattacharyya coefficient of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Bhattacharyya distance + + Examples + -------- + >>> cmp = Bhattacharyya() + >>> cmp.dist('cat', 'hat') + 0.5 + >>> cmp.dist('Niall', 'Neil') + 0.3651483716701107 + >>> cmp.dist('aluminum', 'Catalan') + 0.11785113019775792 + >>> cmp.dist('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._intersection().keys() + src_pop = sum(self._src_tokens.values()) + tar_pop = sum(self._tar_tokens.values()) + + return float( + sum( + ( + self._src_tokens[tok] + / src_pop + * self._tar_tokens[tok] + / tar_pop + ) + ** 0.5 + for tok in alphabet + ) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_bisim.py b/abydos/distance/_bisim.py new file mode 100644 index 000000000..84ee48f01 --- /dev/null +++ b/abydos/distance/_bisim.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._bisim. + +BI-SIM similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import float as np_float +from numpy import zeros as np_zeros + +from ._distance import _Distance + +__all__ = ['BISIM'] + + +class BISIM(_Distance): + r"""BI-SIM similarity. + + BI-SIM similarity :cite:`Kondrak:2003` is an n-gram based, edit-distance + derived similarity measure. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, qval=2, **kwargs): + """Initialize BISIM instance. + + Parameters + ---------- + qval : int + The number of characters to consider in each n-gram (q-gram). By + default this is 2, hence BI-SIM. But TRI-SIM can be calculated by + setting this to 3. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(BISIM, self).__init__(**kwargs) + self._qval = qval + + def sim(self, src, tar): + """Return the BI-SIM similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + BI-SIM similarity + + Examples + -------- + >>> cmp = BISIM() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.4 + >>> cmp.sim('aluminum', 'Catalan') + 0.3125 + >>> cmp.sim('ATCG', 'TAGC') + 0.375 + + + .. 
versionadded:: 0.4.0 + + """ + src_len = len(src) + tar_len = len(tar) + + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + def _id(src_pos, tar_pos): + s = 0 + for i in range(self._qval): + s += int(src[src_pos + i] == tar[tar_pos + i]) + return s / self._qval + + src = src[0].swapcase() * (self._qval - 1) + src + tar = tar[0].swapcase() * (self._qval - 1) + tar + + d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) + + for i in range(1, src_len + 1): + for j in range(1, tar_len + 1): + d_mat[i, j] = max( + d_mat[i - 1, j - 1] + _id(i - 1, j - 1), # sub/== + d_mat[i - 1, j], # ins + d_mat[i, j - 1], # del + ) + return d_mat[src_len, tar_len] / max(src_len, tar_len) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_bleu.py b/abydos/distance/_bleu.py new file mode 100644 index 000000000..da82b4c45 --- /dev/null +++ b/abydos/distance/_bleu.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._bleu. + +BLEU similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import exp, log + +from ._distance import _Distance +from ..tokenizer import QGrams + +__all__ = ['BLEU'] + + +class BLEU(_Distance): + r"""BLEU similarity. + + BLEU similarity :cite:`Papineni:2002` compares two strings for similarity + using a set of tokenizers and a brevity penalty: + + .. math:: + + BP = + \left\{ + \begin{array}{lrl} + 1 & \textup{if} & c > r \\ + e^{(1-\frac{r}{c})} & \textup{if} & c \leq r + \end{array} + \right. + + The BLEU score is then: + + .. math:: + + \textup{B\textsc{leu}} = BP \cdot e^{\sum_{n=1}^N w_n log p_n} + + For tokenizers 1 to N, by default q-gram tokenizers for q=1 to N in + Abydos, weights :math:`w_n`, which are uniformly :math:`\frac{1}{N}`, + and :math:`p_n`: + + .. math:: + + p_n = \frac{\sum_{token \in tar} min(Count(token \in tar), + Count(token \in src))}{|tar|} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, n_min=1, n_max=4, tokenizers=None, weights=None, **kwargs + ): + """Initialize BLEU instance. + + Parameters + ---------- + n_min : int + The minimum q-gram value for BLEU score calculation (1 by default) + n_max : int + The maximum q-gram value for BLEU score calculation (4 by default) + tokenizers : list(_Tokenizer) + A list of initialized tokenizers + weights : list(float) + A list of floats representing the weights of the tokenizers. If + tokenizers is set, this must have the same length. If n_min and + n_max are used to set tokenizers, this must have length equal to + n_max-n_min-1. Otherwise, uniform weights will be used. + **kwargs + Arbitrary keyword arguments + + + .. 
versionadded:: 0.4.0 + + """ + super(BLEU, self).__init__(**kwargs) + self._tokenizers = ( + [QGrams(qval=n, start_stop='') for n in range(n_min, n_max + 1)] + if tokenizers is None + else tokenizers + ) + self._weights = weights + if not weights or len(weights) != len(self._tokenizers): + self._weights = [ + 1.0 / len(self._tokenizers) + for _ in range(len(self._tokenizers)) + ] + + def sim(self, src, tar): + """Return the BLEU similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + BLEU similarity + + Examples + -------- + >>> cmp = BLEU() + >>> cmp.sim('cat', 'hat') + 0.7598356856515925 + >>> cmp.sim('Niall', 'Neil') + 0.7247557929987696 + >>> cmp.sim('aluminum', 'Catalan') + 0.44815260192961937 + >>> cmp.sim('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if not src or not tar: + return 0.0 + + brevity_penalty = ( + 1.0 if len(tar) >= len(src) else exp(1 - len(src) / len(tar)) + ) + + bleu_sum = 0.0 + bleu_null = True + + for i in range(len(self._tokenizers)): + tar_tokens = self._tokenizers[i].tokenize(tar).get_counter() + tokens_int = ( + self._tokenizers[i].tokenize(src).get_counter() & tar_tokens + ) + tar_total = sum(tar_tokens.values()) + + if tokens_int: + bleu_null = False + bleu_sum += ( + log(sum(tokens_int.values()) / tar_total) + * self._weights[i] + ) + + if bleu_null: + return 0.0 + + return brevity_penalty * exp(bleu_sum) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_block_levenshtein.py b/abydos/distance/_block_levenshtein.py new file mode 100644 index 000000000..88470f2a4 --- /dev/null +++ b/abydos/distance/_block_levenshtein.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._block_levenshtein. + +Levenshtein distance with block operations +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._lcsstr import LCSstr +from ._levenshtein import Levenshtein + +__all__ = ['BlockLevenshtein'] + + +class BlockLevenshtein(Levenshtein): + """Levenshtein distance with block operations. + + In addition to character-level insert, delete, and replace operations, + this version of the Levenshtein distance supports block-level insert, + delete, and replace, provided that the block occurs in both input + strings. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, cost=(1, 1, 1, 1), normalizer=max, **kwargs): + """Initialize BlockLevenshtein instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. 
versionadded:: 0.4.0 + + """ + super(BlockLevenshtein, self).__init__( + cost=cost, normalizer=normalizer, **kwargs + ) + self.lcs = LCSstr() + + def dist_abs(self, src, tar): + """Return the block Levenshtein edit distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The block Levenshtein edit distance between src & tar + + Examples + -------- + >>> cmp = BlockLevenshtein() + >>> cmp.dist_abs('cat', 'hat') + 1 + >>> cmp.dist_abs('Niall', 'Neil') + 3 + >>> cmp.dist_abs('aluminum', 'Catalan') + 7 + >>> cmp.dist_abs('ATCG', 'TAGC') + 3 + + + .. versionadded:: 0.4.0 + + """ + alphabet = set(src) | set(tar) + next_char = ord('A') + lcs = self.lcs.lcsstr(src, tar) + while len(lcs) > 1: + while chr(next_char) in alphabet: + next_char += 1 + p = self.lcs.lcsstr(src, tar) + src = src.replace(p, chr(next_char)) + tar = tar.replace(p, chr(next_char)) + alphabet.add(chr(next_char)) + lcs = self.lcs.lcsstr(src, tar) + d = super(BlockLevenshtein, self).dist_abs(src, tar) + return d + + def dist(self, src, tar): + """Return the normalized block Levenshtein distance between strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The normalized Levenshtein distance with blocks between src & tar + + Examples + -------- + >>> cmp = BlockLevenshtein() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.333333333333 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 0.6 + >>> cmp.dist('aluminum', 'Catalan') + 0.875 + >>> cmp.dist('ATCG', 'TAGC') + 0.75 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + ins_cost, del_cost = self._cost[:2] + return self.dist_abs(src, tar) / ( + self._normalizer([len(src) * del_cost, len(tar) * ins_cost]) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_brainerd_robinson.py b/abydos/distance/_brainerd_robinson.py new file mode 100644 index 000000000..961179fad --- /dev/null +++ b/abydos/distance/_brainerd_robinson.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._brainerd_robinson. + +Brainerd-Robinson similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BrainerdRobinson'] + + +class BrainerdRobinson(_TokenDistance): + r"""Brainerd-Robinson similarity. + + For two multisets X and Y drawn from an alphabet S, Brainerd-Robinson + similarity :cite:`Robinson:1951,Brainerd:1951` is + + .. math:: + + sim_{BrainerdRobinson}(X, Y) = + 200 - 100 \cdot \sum_{i \in S} |\frac{X_i}{\sum_{i \in S} |X_i|} - + \frac{Y_i}{\sum_{i \in S} |Y_i|}| + + .. 
versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize BrainerdRobinson instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(BrainerdRobinson, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim_score(self, src, tar): + """Return the Brainerd-Robinson similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Brainerd-Robinson similarity + + Examples + -------- + >>> cmp = BrainerdRobinson() + >>> cmp.sim_score('cat', 'hat') + 100.0 + >>> cmp.sim_score('Niall', 'Neil') + 66.66666666666669 + >>> cmp.sim_score('aluminum', 'Catalan') + 22.2222222222222 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + src_card = max(1, self._src_card()) + tar_card = max(1, self._tar_card()) + + score = 200.0 - 100.0 * sum( + abs( + self._src_tokens[tok] / src_card + - self._tar_tokens[tok] / tar_card + ) + for tok in alphabet + ) + if score < 1e-13: + score = 0.0 + + return score + + def sim(self, src, tar): + """Return the normalized Brainerd-Robinson similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Brainerd-Robinson similarity + + Examples + -------- + >>> cmp = BrainerdRobinson() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3333333333333334 + >>> cmp.sim('aluminum', 'Catalan') + 0.111111111111111 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return self.sim_score(src, tar) / 200.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_braun_blanquet.py b/abydos/distance/_braun_blanquet.py new file mode 100644 index 000000000..b064811d1 --- /dev/null +++ b/abydos/distance/_braun_blanquet.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._braun_blanquet. + +Braun-Blanquet similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['BraunBlanquet'] + + +class BraunBlanquet(_TokenDistance): + r"""Braun-Blanquet similarity. 
+ + For two sets X and Y and a population N, the Braun-Blanquet + similarity :cite:`BraunBlanquet:1932` is + + .. math:: + + sim_{BraunBlanquet}(X, Y) = \frac{|X \cap Y|}{max(|X|, |Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{BraunBlanquet} = + \frac{a}{max(a+b, a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize BraunBlanquet instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(BraunBlanquet, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Braun-Blanquet similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Braun-Blanquet similarity + + Examples + -------- + >>> cmp = BraunBlanquet() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.1111111111111111 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + return self._intersection_card() / max( + self._src_card(), self._tar_card() + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_canberra.py b/abydos/distance/_canberra.py new file mode 100644 index 000000000..0e666660b --- /dev/null +++ b/abydos/distance/_canberra.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._canberra. 
+ +Canberra distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Canberra'] + + +class Canberra(_TokenDistance): + r"""Canberra distance. + + For two sets X and Y, the Canberra distance :cite:`Lance:1966,Lance:1967b` + is + + .. math:: + + sim_{Canberra}(X, Y) = \frac{|X \triangle Y|}{|X|+|Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Canberra} = + \frac{b+c}{(a+b)+(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Canberra instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Canberra, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def dist(self, src, tar): + """Return the Canberra distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Canberra distance + + Examples + -------- + >>> cmp = Canberra() + >>> cmp.dist('cat', 'hat') + 0.5 + >>> cmp.dist('Niall', 'Neil') + 0.6363636363636364 + >>> cmp.dist('aluminum', 'Catalan') + 0.8823529411764706 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + return self._symmetric_difference_card() / self._total_card() + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_chebyshev.py b/abydos/distance/_chebyshev.py index 038fc7804..83508dfe4 100644 --- a/abydos/distance/_chebyshev.py +++ b/abydos/distance/_chebyshev.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._minkowski import Minkowski +from .. import __version__ __all__ = ['Chebyshev', 'chebyshev'] @@ -38,9 +41,54 @@ class Chebyshev(Minkowski): Euclidean distance is the chessboard distance, equivalent to Minkowski distance in :math:`L^\infty`-space. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, qval=2, alphabet=None): + def __init__( + self, alphabet=0, tokenizer=None, intersection_type='crisp', **kwargs + ): + """Initialize Euclidean instance. 
+ + Parameters + ---------- + alphabet : collection or int + The values or size of the alphabet + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Chebyshev, self).__init__( + pval=float('inf'), + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar): r"""Return the Chebyshev distance between two strings. Parameters @@ -49,10 +97,6 @@ def dist_abs(self, src, tar, qval=2, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version alphabet - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -70,15 +114,20 @@ def dist_abs(self, src, tar, qval=2, alphabet=None): 1.0 >>> cmp.dist_abs('ATCG', 'TAGC') 1.0 - >>> cmp.dist_abs('ATCG', 'TAGC', qval=1) + + >>> cmp = Chebyshev(qval=1) + >>> cmp.dist_abs('ATCG', 'TAGC') 0.0 - >>> cmp.dist_abs('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG', qval=1) + >>> cmp.dist_abs('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG') 3.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return super(self.__class__, self).dist_abs( - src, tar, qval, float('inf'), False, alphabet - ) + return super(Chebyshev, self).dist_abs(src, tar, False) def sim(self, *args, **kwargs): """Raise exception when called. @@ -95,6 +144,9 @@ def sim(self, *args, **kwargs): NotImplementedError Method disabled for Chebyshev distance + + .. versionadded:: 0.3.6 + """ raise NotImplementedError('Method disabled for Chebyshev distance.') @@ -113,11 +165,20 @@ def dist(self, *args, **kwargs): NotImplementedError Method disabled for Chebyshev distance + + .. versionadded:: 0.3.6 + """ raise NotImplementedError('Method disabled for Chebyshev distance.') -def chebyshev(src, tar, qval=2, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Chebyshev.dist_abs method instead.', +) +def chebyshev(src, tar, qval=2, alphabet=0): r"""Return the Chebyshev distance between two strings. This is a wrapper for the :py:meth:`Chebyshev.dist_abs`. @@ -129,7 +190,7 @@ def chebyshev(src, tar, qval=2, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version alphabet + The length of each q-gram alphabet : collection or int The values or size of the alphabet @@ -153,8 +214,10 @@ def chebyshev(src, tar, qval=2, alphabet=None): >>> chebyshev('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG', qval=1) 3.0 + .. 
versionadded:: 0.3.0 + """ - return Chebyshev().dist_abs(src, tar, qval, alphabet) + return Chebyshev(alphabet=alphabet, qval=qval).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_chord.py b/abydos/distance/_chord.py new file mode 100644 index 000000000..b63d2e3d0 --- /dev/null +++ b/abydos/distance/_chord.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._chord. + +Chord distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Chord'] + + +class Chord(_TokenDistance): + r"""Chord distance. + + For two sets X and Y drawn from an alphabet S, the chord distance + :cite:`Orloci:1967` is + + .. math:: + + sim_{chord}(X, Y) = + \sqrt{\sum_{i \in S}\Big(\frac{X_i}{\sqrt{\sum_{j \in X} X_j^2}} - + \frac{Y_i}{\sqrt{\sum_{j \in Y} Y_j^2}}\Big)^2} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Chord instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Chord, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def dist_abs(self, src, tar): + """Return the Chord distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Chord distance + + Examples + -------- + >>> cmp = Chord() + >>> cmp.dist_abs('cat', 'hat') + 1.0 + >>> cmp.dist_abs('Niall', 'Neil') + 1.126811100699571 + >>> cmp.dist_abs('aluminum', 'Catalan') + 1.336712116966249 + >>> cmp.dist_abs('ATCG', 'TAGC') + 1.414213562373095 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + + den1 = max( + 1, sum(val * val for val in self._src_tokens.values()) ** 0.5 + ) + den2 = max( + 1, sum(val * val for val in self._tar_tokens.values()) ** 0.5 + ) + + return round( + sum( + (self._src_tokens[i] / den1 - self._tar_tokens[i] / den2) ** 2 + for i in alphabet + ) + ** 0.5, + 15, + ) + + def dist(self, src, tar): + """Return the normalized Chord distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized chord distance + + Examples + -------- + >>> cmp = Chord() + >>> cmp.dist('cat', 'hat') + 0.707106781186547 + >>> cmp.dist('Niall', 'Neil') + 0.796775770420944 + >>> cmp.dist('aluminum', 'Catalan') + 0.94519820240106 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + return round(self.dist_abs(src, tar) / (2 ** 0.5), 15) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_clement.py b/abydos/distance/_clement.py new file mode 100644 index 000000000..f4e04522f --- /dev/null +++ b/abydos/distance/_clement.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._clement. + +Clement similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Clement'] + + +class Clement(_TokenDistance): + r"""Clement similarity. + + For two sets X and Y and a population N, Clement similarity + :cite:`Clement:1976` is defined as + + .. math:: + + sim_{Clement}(X, Y) = + \frac{|X \cap Y|}{|X|}\Big(1-\frac{|X|}{|N|}\Big) + + \frac{|(N \setminus X) \setminus Y|}{|N \setminus X|} + \Big(1-\frac{|N \setminus X|}{|N|}\Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Clement} = + \frac{a}{a+b}\Big(1 - \frac{a+b}{n}\Big) + + \frac{d}{c+d}\Big(1 - \frac{c+d}{n}\Big) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Clement instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Clement, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Clement similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Clement similarity + + Examples + -------- + >>> cmp = Clement() + >>> cmp.sim('cat', 'hat') + 0.5025379382522239 + >>> cmp.sim('Niall', 'Neil') + 0.33840586363079933 + >>> cmp.sim('aluminum', 'Catalan') + 0.12119877280918714 + >>> cmp.sim('ATCG', 'TAGC') + 0.006336616803332366 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + score = 0.0 + if a + b: + score += (a / (a + b)) * (1 - (a + b) / n) + if c + d: + score += (d / (c + d)) * (1 - (c + d) / n) + + return score + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_cohen_kappa.py b/abydos/distance/_cohen_kappa.py new file mode 100644 index 000000000..87d3d7885 --- /dev/null +++ b/abydos/distance/_cohen_kappa.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._cohen_kappa. + +Cohen's Kappa similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['CohenKappa'] + + +class CohenKappa(_TokenDistance): + r"""Cohen's Kappa similarity. + + For two sets X and Y and a population N, Cohen's \kappa similarity + :cite:`Cohen:1960` is + + .. math:: + + sim_{Cohen_\kappa}(X, Y) = \kappa = + \frac{p_o - p_e^\kappa}{1 - p_e^\kappa} + + where + + .. math:: + + \begin{array}{l} + p_o = \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}\\ + \\ + p_e^\kappa = \frac{|X|}{|N|} \cdot \frac{|Y|}{|N|} + + \frac{|N \setminus X|}{|N|} \cdot \frac{|N \setminus Y|}{|N|} + \end{array} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + \begin{array}{l} + p_o = \frac{a+d}{n}\\ + \\ + p_e^\kappa = \frac{a+b}{n} \cdot \frac{a+c}{n} + + \frac{c+d}{n} \cdot \frac{b+d}{n} + \end{array} + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize CohenKappa instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(CohenKappa, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return Cohen's Kappa similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Cohen's Kappa similarity + + Examples + -------- + >>> cmp = CohenKappa() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9955041746949261 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903412749517064 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if d: + return 2 * d / (b + c + 2 * d) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_cole.py b/abydos/distance/_cole.py new file mode 100644 index 000000000..bf5fd6e86 --- /dev/null +++ b/abydos/distance/_cole.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._cole. + +Cole correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Cole'] + + +class Cole(_TokenDistance): + r"""Cole correlation. 
+
+    For two sets X and Y and a population N, the Cole correlation
+    :cite:`Cole:1949` has three formulae:
+
+    - If :math:`|X \cap Y| \cdot |(N \setminus X) \setminus Y| \geq
+      |X \setminus Y| \cdot |Y \setminus X|` then
+
+      .. math::
+
+          corr_{Cole}(X, Y) =
+          \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
+          |X \setminus Y| \cdot |Y \setminus X|}
+          {(|X \cap Y| + |X \setminus Y|) \cdot
+          (|X \setminus Y| + |(N \setminus X) \setminus Y|)}
+
+    - If :math:`|(N \setminus X) \setminus Y| \geq |X \cap Y|` then
+
+      .. math::
+
+          corr_{Cole}(X, Y) =
+          \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
+          |X \setminus Y| \cdot |Y \setminus X|}
+          {(|X \cap Y| + |X \setminus Y|) \cdot
+          (|X \cap Y| + |Y \setminus X|)}
+
+    - Otherwise
+
+      .. math::
+
+          corr_{Cole}(X, Y) =
+          \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
+          |X \setminus Y| \cdot |Y \setminus X|}
+          {(|X \setminus Y| + |(N \setminus X) \setminus Y|) \cdot
+          (|Y \setminus X| + |(N \setminus X) \setminus Y|)}
+
+    Cole terms this measurement the Coefficient of Interspecific Association.
+
+    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
+    this is
+
+    .. math::
+
+        corr_{Cole} =
+        \left\{
+        \begin{array}{ll}
+            \frac{ad-bc}{(a+b)(b+d)} & \textup{if} ~ad \geq bc \\
+            \\
+            \frac{ad-bc}{(a+b)(a+c)} & \textup{if} ~d \geq a \\
+            \\
+            \frac{ad-bc}{(b+d)(c+d)} & \textup{otherwise}
+        \end{array}
+        \right.
+
+    .. versionadded:: 0.4.0
+    """
+
+    def __init__(
+        self,
+        alphabet=None,
+        tokenizer=None,
+        intersection_type='crisp',
+        **kwargs
+    ):
+        """Initialize Cole instance.
+
+        Parameters
+        ----------
+        alphabet : Counter, collection, int, or None
+            This represents the alphabet of possible tokens.
+            See :ref:`alphabet ` description in
+            :py:class:`_TokenDistance` for details.
+        tokenizer : _Tokenizer
+            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
+        intersection_type : str
+            Specifies the intersection type, and set type as a result:
+            See :ref:`intersection_type ` description in
+            :py:class:`_TokenDistance` for details.
+        **kwargs
+            Arbitrary keyword arguments
+
+        Other Parameters
+        ----------------
+        qval : int
+            The length of each q-gram. Using this parameter and tokenizer=None
+            will cause the instance to use the QGram tokenizer with this
+            q value.
+        metric : _Distance
+            A string distance measure class for use in the ``soft`` and
+            ``fuzzy`` variants.
+        threshold : float
+            A threshold value, similarities above which are counted as
+            members of the intersection for the ``fuzzy`` variant.
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        super(Cole, self).__init__(
+            alphabet=alphabet,
+            tokenizer=tokenizer,
+            intersection_type=intersection_type,
+            **kwargs
+        )
+
+    def corr(self, src, tar):
+        """Return the Cole correlation of two strings.
+
+        Parameters
+        ----------
+        src : str
+            Source string (or QGrams/Counter objects) for comparison
+        tar : str
+            Target string (or QGrams/Counter objects) for comparison
+
+        Returns
+        -------
+        float
+            Cole correlation
+
+        Examples
+        --------
+        >>> cmp = Cole()
+        >>> cmp.corr('cat', 'hat')
+        0.49743589743589745
+        >>> cmp.corr('Niall', 'Neil')
+        0.3290543431750107
+        >>> cmp.corr('aluminum', 'Catalan')
+        0.10195910195910196
+        >>> cmp.corr('ATCG', 'TAGC')
+        -1.0
+
+
+        ..
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + admbc = a * d - b * c + + if admbc == 0.0: + return 0.0 + + if a * d >= b * c: + return admbc / ((a + b) * (b + d)) + if d >= a: + return admbc / ((a + b) * (a + c)) + return admbc / ((b + d) * (c + d)) + + def sim(self, src, tar): + """Return the Cole similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for similarity + tar : str + Target string (or QGrams/Counter objects) for similarity + + Returns + ------- + float + Cole similarity + + Examples + -------- + >>> cmp = Cole() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6645271715875054 + >>> cmp.sim('aluminum', 'Catalan') + 0.550979550979551 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_complete_linkage.py b/abydos/distance/_complete_linkage.py new file mode 100644 index 000000000..3f40126ad --- /dev/null +++ b/abydos/distance/_complete_linkage.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._complete_linkage. + +Complete linkage distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._levenshtein import Levenshtein +from ._token_distance import _TokenDistance + +__all__ = ['CompleteLinkage'] + + +class CompleteLinkage(_TokenDistance): + r"""Complete linkage distance. + + For two multisets X and Y, complete linkage distance + :cite:`Deza:2016` is + + .. math:: + + sim_{CompleteLinkage}(X, Y) = + max_{i \in X, j \in Y} dist(X_i, Y_j) + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, metric=None, **kwargs): + """Initialize CompleteLinkage instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. (Defaults to Levenshtein distance) + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(CompleteLinkage, self).__init__(tokenizer=tokenizer, **kwargs) + if metric is None: + self._metric = Levenshtein() + else: + self._metric = metric + + def dist_abs(self, src, tar): + """Return the complete linkage distance of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + complete linkage distance + + Examples + -------- + >>> cmp = CompleteLinkage() + >>> cmp.dist_abs('cat', 'hat') + 2 + >>> cmp.dist_abs('Niall', 'Neil') + 2 + >>> cmp.dist_abs('aluminum', 'Catalan') + 2 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src, tar = self._get_tokens() + + max_val = float('-inf') + + for term_src in src.keys(): + for term_tar in tar.keys(): + max_val = max( + max_val, self._metric.dist_abs(term_src, term_tar) + ) + + return max_val + + def dist(self, src, tar): + """Return the normalized complete linkage distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + normalized complete linkage distance + + Examples + -------- + >>> cmp = CompleteLinkage() + >>> cmp.dist('cat', 'hat') + 1.0 + >>> cmp.dist('Niall', 'Neil') + 1.0 + >>> cmp.dist('aluminum', 'Catalan') + 1.0 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src, tar = self._get_tokens() + + max_val = 0.0 + + for term_src in src.keys(): + for term_tar in tar.keys(): + max_val = max(max_val, self._metric.dist(term_src, term_tar)) + + return max_val + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_consonni_todeschini_i.py b/abydos/distance/_consonni_todeschini_i.py new file mode 100644 index 000000000..d33520546 --- /dev/null +++ b/abydos/distance/_consonni_todeschini_i.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._consonni_todeschini_i. + +Consonni & Todeschini I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['ConsonniTodeschiniI'] + + +class ConsonniTodeschiniI(_TokenDistance): + r"""Consonni & Todeschini I similarity. + + For two sets X and Y and a population N, Consonni & Todeschini I similarity + :cite:`Consonni:2012` is + + .. math:: + + sim_{ConsonniTodeschiniI}(X, Y) = + \frac{log(1+|X \cap Y|+|(N \setminus X) \setminus Y|)} + {log(1+|N|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{ConsonniTodeschiniI} = + \frac{log(1+a+d)}{log(1+n)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ConsonniTodeschiniI instance. 
+ + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ConsonniTodeschiniI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Consonni & Todeschini I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Consonni & Todeschini I similarity + + Examples + -------- + >>> cmp = ConsonniTodeschiniI() + >>> cmp.sim('cat', 'hat') + 0.9992336018090547 + >>> cmp.sim('Niall', 'Neil') + 0.998656222829757 + >>> cmp.sim('aluminum', 'Catalan') + 0.9971098629456009 + >>> cmp.sim('ATCG', 'TAGC') + 0.9980766131469967 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return log1p(a + d) / log1p(n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_consonni_todeschini_ii.py b/abydos/distance/_consonni_todeschini_ii.py new file mode 100644 index 000000000..6479472fd --- /dev/null +++ b/abydos/distance/_consonni_todeschini_ii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._consonni_todeschini_ii. + +Consonni & Todeschini II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['ConsonniTodeschiniII'] + + +class ConsonniTodeschiniII(_TokenDistance): + r"""Consonni & Todeschini II similarity. + + For two sets X and Y and a population N, Consonni & Todeschini II + similarity :cite:`Consonni:2012` is + + .. 
math::
+
+        sim_{ConsonniTodeschiniII}(X, Y) =
+        \frac{log(1+|N|) - log(1+|X \setminus Y|+|Y \setminus X|)}
+        {log(1+|N|)}
+
+    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
+    this is
+
+    .. math::
+
+        sim_{ConsonniTodeschiniII} =
+        \frac{log(1+n)-log(1+b+c)}{log(1+n)}
+
+    .. versionadded:: 0.4.0
+    """
+
+    def __init__(
+        self,
+        alphabet=None,
+        tokenizer=None,
+        intersection_type='crisp',
+        **kwargs
+    ):
+        """Initialize ConsonniTodeschiniII instance.
+
+        Parameters
+        ----------
+        alphabet : Counter, collection, int, or None
+            This represents the alphabet of possible tokens.
+            See :ref:`alphabet ` description in
+            :py:class:`_TokenDistance` for details.
+        tokenizer : _Tokenizer
+            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
+        intersection_type : str
+            Specifies the intersection type, and set type as a result:
+            See :ref:`intersection_type ` description in
+            :py:class:`_TokenDistance` for details.
+        **kwargs
+            Arbitrary keyword arguments
+
+        Other Parameters
+        ----------------
+        qval : int
+            The length of each q-gram. Using this parameter and tokenizer=None
+            will cause the instance to use the QGram tokenizer with this
+            q value.
+        metric : _Distance
+            A string distance measure class for use in the ``soft`` and
+            ``fuzzy`` variants.
+        threshold : float
+            A threshold value, similarities above which are counted as
+            members of the intersection for the ``fuzzy`` variant.
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        super(ConsonniTodeschiniII, self).__init__(
+            alphabet=alphabet,
+            tokenizer=tokenizer,
+            intersection_type=intersection_type,
+            **kwargs
+        )
+
+    def sim(self, src, tar):
+        """Return the Consonni & Todeschini II similarity of two strings.
+
+        Parameters
+        ----------
+        src : str
+            Source string (or QGrams/Counter objects) for comparison
+        tar : str
+            Target string (or QGrams/Counter objects) for comparison
+
+        Returns
+        -------
+        float
+            Consonni & Todeschini II similarity
+
+        Examples
+        --------
+        >>> cmp = ConsonniTodeschiniII()
+        >>> cmp.sim('cat', 'hat')
+        0.7585487129939101
+        >>> cmp.sim('Niall', 'Neil')
+        0.6880377723094788
+        >>> cmp.sim('aluminum', 'Catalan')
+        0.5841297898633079
+        >>> cmp.sim('ATCG', 'TAGC')
+        0.640262668568961
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        if src == tar:
+            return 1.0
+
+        self._tokenize(src, tar)
+
+        b = self._src_only_card()
+        c = self._tar_only_card()
+        n = self._population_unique_card()
+
+        return (log1p(n) - log1p(b + c)) / log1p(n)
+
+
+if __name__ == '__main__':
+    import doctest
+
+    doctest.testmod()
diff --git a/abydos/distance/_consonni_todeschini_iii.py b/abydos/distance/_consonni_todeschini_iii.py
new file mode 100644
index 000000000..652b7c1d3
--- /dev/null
+++ b/abydos/distance/_consonni_todeschini_iii.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see .
+
+"""abydos.distance._consonni_todeschini_iii.
+ +Consonni & Todeschini III similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['ConsonniTodeschiniIII'] + + +class ConsonniTodeschiniIII(_TokenDistance): + r"""Consonni & Todeschini III similarity. + + For two sets X and Y and a population N, Consonni & Todeschini III + similarity :cite:`Consonni:2012` is + + .. math:: + + sim_{ConsonniTodeschiniIII}(X, Y) = + \frac{log(1+|X \cap Y|)}{log(1+|N|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{ConsonniTodeschiniIII} = + \frac{log(1+a)}{log(1+n)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ConsonniTodeschiniIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ConsonniTodeschiniIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Consonni & Todeschini III similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Consonni & Todeschini III similarity + + Examples + -------- + >>> cmp = ConsonniTodeschiniIII() + >>> cmp.sim('cat', 'hat') + 0.1648161441769704 + >>> cmp.sim('Niall', 'Neil') + 0.1648161441769704 + >>> cmp.sim('aluminum', 'Catalan') + 0.10396755253417303 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + n = self._population_unique_card() + + if src == tar and n <= a: + return 1.0 + + return log1p(a) / log1p(n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_consonni_todeschini_iv.py b/abydos/distance/_consonni_todeschini_iv.py new file mode 100644 index 000000000..89fde379a --- /dev/null +++ b/abydos/distance/_consonni_todeschini_iv.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._consonni_todeschini_iv. + +Consonni & Todeschini IV similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['ConsonniTodeschiniIV'] + + +class ConsonniTodeschiniIV(_TokenDistance): + r"""Consonni & Todeschini IV similarity. + + For two sets X and Y and a population N, Consonni & Todeschini IV + similarity :cite:`Consonni:2012` is + + .. math:: + + sim_{ConsonniTodeschiniIV}(X, Y) = + \frac{log(1+|X \cap Y|)}{log(1+|X \cup Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{ConsonniTodeschiniIV} = + \frac{log(1+a)}{log(1+a+b+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ConsonniTodeschiniIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ConsonniTodeschiniIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Consonni & Todeschini IV similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Consonni & Todeschini IV similarity + + Examples + -------- + >>> cmp = ConsonniTodeschiniIV() + >>> cmp.sim('cat', 'hat') + 0.5645750340535796 + >>> cmp.sim('Niall', 'Neil') + 0.4771212547196623 + >>> cmp.sim('aluminum', 'Catalan') + 0.244650542118226 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return log1p(a) / log1p(a + b + c) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_consonni_todeschini_v.py b/abydos/distance/_consonni_todeschini_v.py new file mode 100644 index 000000000..23d00b568 --- /dev/null +++ b/abydos/distance/_consonni_todeschini_v.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._consonni_todeschini_v. + +Consonni & Todeschini V correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['ConsonniTodeschiniV'] + + +class ConsonniTodeschiniV(_TokenDistance): + r"""Consonni & Todeschini V correlation. + + For two sets X and Y and a population N, Consonni & Todeschini V + correlation :cite:`Consonni:2012` is + + .. math:: + + corr_{ConsonniTodeschiniV}(X, Y) = + \frac{log(1+|X \cap Y| \cdot |(N \setminus X) \setminus Y|)- + log(1+|X \setminus Y| \cdot |Y \setminus X|)} + {log(1+\frac{|N|^2}{4})} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{ConsonniTodeschiniV} = + \frac{log(1+ad)-log(1+bc)}{log(1+\frac{n^2}{4})} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ConsonniTodeschiniV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ConsonniTodeschiniV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Consonni & Todeschini V correlation of two strings. 
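+ + This correlation takes values in the range [-1, 1]; the ``sim`` method + rescales it to [0, 1] as (1 + corr) / 2.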
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Consonni & Todeschini V correlation + + Examples + -------- + >>> cmp = ConsonniTodeschiniV() + >>> cmp.corr('cat', 'hat') + 0.48072545510682463 + >>> cmp.corr('Niall', 'Neil') + 0.4003930264973547 + >>> cmp.corr('aluminum', 'Catalan') + 0.21794239483504532 + >>> cmp.corr('ATCG', 'TAGC') + -0.2728145951429799 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = log1p(a * d) - log1p(b * c) + if num == 0.0: + return 0.0 + + return num / log1p(n ** 2 / 4) + + def sim(self, src, tar): + """Return the Consonni & Todeschini V similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Consonni & Todeschini V similarity + + Examples + -------- + >>> cmp = ConsonniTodeschiniV() + >>> cmp.sim('cat', 'hat') + 0.7403627275534124 + >>> cmp.sim('Niall', 'Neil') + 0.7001965132486774 + >>> cmp.sim('aluminum', 'Catalan') + 0.6089711974175227 + >>> cmp.sim('ATCG', 'TAGC') + 0.36359270242851005 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_cormode_lz.py b/abydos/distance/_cormode_lz.py new file mode 100644 index 000000000..712e60708 --- /dev/null +++ b/abydos/distance/_cormode_lz.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._cormode_lz. + +Cormode's LZ distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +__all__ = ['CormodeLZ'] + + +class CormodeLZ(_Distance): + r"""Cormode's LZ distance. + + Cormode's LZ distance :cite:`Cormode:2000,Cormode:2003` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, **kwargs): + """Initialize CormodeLZ instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(CormodeLZ, self).__init__(**kwargs) + + def dist_abs(self, src, tar): + """Return the Cormode's LZ distance of two strings. 
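+ + The distance is computed by greedily parsing ``src`` into successive + substrings that occur in ``tar`` or in the already-parsed portion of + ``src``; each time the current substring can no longer be extended by + such a match, one edit is counted, and the result is 1 plus the number + of edits.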
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Cormode's LZ distance + + Examples + -------- + >>> cmp = CormodeLZ() + >>> cmp.dist_abs('cat', 'hat') + 2 + >>> cmp.dist_abs('Niall', 'Neil') + 5 + >>> cmp.dist_abs('aluminum', 'Catalan') + 6 + >>> cmp.dist_abs('ATCG', 'TAGC') + 4 + + + .. versionadded:: 0.4.0 + + """ + edits = 0 + pos = 0 + span = 1 + + while max(pos + 1, pos + span) <= len(src): + if (src[pos : pos + span] in tar) or ( + src[pos : pos + span] in src[:pos] + ): + span += 1 + else: + edits += 1 + pos += max(1, span - 1) + span = 1 + + return 1 + edits + + def dist(self, src, tar): + """Return the normalized Cormode's LZ distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Cormode's LZ distance + + Examples + -------- + >>> cmp = CormodeLZ() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.8 + >>> cmp.dist('aluminum', 'Catalan') + 0.625 + >>> cmp.dist('ATCG', 'TAGC') + 0.75 + + + .. versionadded:: 0.4.0 + + """ + num = self.dist_abs(src, tar) - 1 + if num == 0: + return 0.0 + return num / len(src) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_cosine.py b/abydos/distance/_cosine.py index d527c9d57..a3246177b 100644 --- a/abydos/distance/_cosine.py +++ b/abydos/distance/_cosine.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -30,7 +30,10 @@ from math import sqrt +from deprecation import deprecated + from ._token_distance import _TokenDistance +from .. import __version__ __all__ = ['Cosine', 'dist_cosine', 'sim_cosine'] @@ -39,11 +42,59 @@ class Cosine(_TokenDistance): r"""Cosine similarity. For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or - Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is: - :math:`sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}`. + Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is + + .. math:: + + sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{cosine} = + \frac{a}{\sqrt{(a+b)(a+c)}} + + .. versionadded:: 0.3.6 """ - def sim(self, src, tar, qval=2): + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Cosine instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(Cosine, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): r"""Return the cosine similarity of two strings. Parameters @@ -52,8 +103,6 @@ def sim(self, src, tar, qval=2): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version Returns ------- @@ -72,20 +121,32 @@ def sim(self, src, tar, qval=2): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 if not src or not tar: return 0.0 - q_src, q_tar = self._get_qgrams(src, tar, qval) - q_src_mag = sum(q_src.values()) - q_tar_mag = sum(q_tar.values()) - q_intersection_mag = sum((q_src & q_tar).values()) + self._tokenize(src, tar) + + num = self._intersection_card() - return q_intersection_mag / sqrt(q_src_mag * q_tar_mag) + if num: + return num / sqrt(self._src_card() * self._tar_card()) + return 0.0 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Cosine.sim method instead.', +) def sim_cosine(src, tar, qval=2): r"""Return the cosine similarity of two strings. @@ -98,7 +159,7 @@ def sim_cosine(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -116,10 +177,18 @@ def sim_cosine(src, tar, qval=2): >>> sim_cosine('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ - return Cosine().sim(src, tar, qval) + return Cosine(qval=qval).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Cosine.dist method instead.', +) def dist_cosine(src, tar, qval=2): """Return the cosine distance between two strings. @@ -132,7 +201,7 @@ def dist_cosine(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -150,8 +219,10 @@ def dist_cosine(src, tar, qval=2): >>> dist_cosine('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return Cosine().dist(src, tar, qval) + return Cosine(qval=qval).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_covington.py b/abydos/distance/_covington.py new file mode 100644 index 000000000..6687ea113 --- /dev/null +++ b/abydos/distance/_covington.py @@ -0,0 +1,293 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._covington. 
+ +Covington distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import namedtuple +from unicodedata import normalize as unicode_normalize + +from ._distance import _Distance + +__all__ = ['Covington'] + +Alignment = namedtuple('Alignment', ['src', 'tar', 'score']) + + +class Covington(_Distance): + r"""Covington distance. + + Covington distance :cite:`Covington:1996` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, weights=(0, 5, 10, 30, 60, 100, 40, 50), **kwargs): + """Initialize Covington instance. + + Parameters + ---------- + weights : tuple + An 8-tuple of costs for each kind of match or mismatch described in + Covington's paper: + + - exact consonant or glide match + - exact vowel match + - vowel-vowel length mismatch or i and y or u and w + - vowel-vowel mismatch + - consonant-consonant mismatch + - consonant-vowel mismatch + - skip preceded by a skip + - skip not preceded by a skip + + The weights used in Covington's first approximation can be used + by supplying the tuple (0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Covington, self).__init__(**kwargs) + self._weights = weights + self._vowels = set('aeiou') + self._consonants = set('bcdfghjklmnpqrstvxz') + self._glides = set('wy') + + def dist_abs(self, src, tar): + """Return the Covington distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Covington distance + + Examples + -------- + >>> cmp = Covington() + >>> cmp.dist_abs('cat', 'hat') + 65 + >>> cmp.dist_abs('Niall', 'Neil') + 115 + >>> cmp.dist_abs('aluminum', 'Catalan') + 325 + >>> cmp.dist_abs('ATCG', 'TAGC') + 200 + + + .. versionadded:: 0.4.0 + + """ + return self.alignments(src, tar, 1)[0][-1] + + def dist(self, src, tar): + """Return the Covington distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Covington distance + + Examples + -------- + >>> cmp = Covington() + >>> cmp.dist('cat', 'hat') + 0.19117647058823528 + >>> cmp.dist('Niall', 'Neil') + 0.25555555555555554 + >>> cmp.dist('aluminum', 'Catalan') + 0.43333333333333335 + >>> cmp.dist('ATCG', 'TAGC') + 0.45454545454545453 + + + .. versionadded:: 0.4.0 + + """ + normalizer = self._weights[5] * min(len(src), len(tar)) + if len(src) != len(tar): + normalizer += self._weights[7] + normalizer += self._weights[6] * abs(abs(len(src) - len(tar)) - 1) + + return self.dist_abs(src, tar) / normalizer + + def alignments(self, src, tar, top_n=None): + """Return the Covington distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + top_n : int + The number of alignments to return. If None, all alignments will + be returned. If 0, all alignments with the top score will be + returned. + + Returns + ------- + list + Covington alignments + + Examples + -------- + >>> cmp = Covington() + >>> cmp.alignments('hart', 'kordis', top_n=1)[0] + Alignment(src='hart--', tar='kordis', score=240) + >>> cmp.alignments('niy', 'genu', top_n=1)[0] + Alignment(src='--niy', tar='genu-', score=170) + + + .. 
versionadded:: 0.4.0 + + """ + if not src: + if not tar: + return [['', '', 0]] + return [ + [ + '-' * len(tar), + tar, + self._weights[7] + self._weights[6] * (len(tar) - 1), + ] + ] + if not tar: + return [ + [ + src, + '-' * len(src), + self._weights[7] + self._weights[6] * (len(src) - 1), + ] + ] + + terminals = [] + + def _cost(s, t): + if s[-1:] == '-': + if s[-2:] == '--': + return self._weights[6] + else: + return self._weights[7] + elif t[-1:] == '-': + if t[-2:] == '--': + return self._weights[6] + else: + return self._weights[7] + + s = unicode_normalize('NFC', s)[-1:] + t = unicode_normalize('NFC', t)[-1:] + + if s == t: + if s in self._consonants or s in self._glides: + return self._weights[0] + else: + return self._weights[1] + + if ''.join(sorted([s, t])) in {'iy', 'uw'}: + return self._weights[2] + + sd = unicode_normalize('NFKD', s) + td = unicode_normalize('NFKD', t) + + if sd[0] == td[0] and s in self._vowels: + return self._weights[2] + + if sd[0] in self._vowels and td[0] in self._vowels: + return self._weights[3] + if sd[0] in self._consonants and td[0] in self._consonants: + return self._weights[4] + + return self._weights[5] + + def _add_alignments(cost, src, tar, src_align, tar_align): + cost += _cost(src_align, tar_align) + + if src and tar: + _add_alignments( + cost, + src[1:], + tar[1:], + src_align + src[0], + tar_align + tar[0], + ) + if tar and tar_align[-1] != '-': + _add_alignments( + cost, src, tar[1:], src_align + '-', tar_align + tar[0] + ) + if src and src_align[-1] != '-': + _add_alignments( + cost, src[1:], tar, src_align + src[0], tar_align + '-' + ) + + if not src and not tar: + terminals.append(Alignment(src_align, tar_align, cost)) + + return + + _add_alignments(0, src, tar[1:], '-', tar[0]) + _add_alignments(0, src[1:], tar, src[0], '-') + _add_alignments(0, src[1:], tar[1:], src[0], tar[0]) + + def _score(al): + return al.score + + terminals = sorted(terminals, key=_score) + + if top_n == 0: + top_score = terminals[0].score + top_n = 1 + while ( + top_n < len(terminals) and terminals[top_n].score == top_score + ): + top_n += 1 + + if top_n is None: + return terminals + else: + return terminals[:top_n] + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_damerau_levenshtein.py b/abydos/distance/_damerau_levenshtein.py index ebfec47cd..fa423f40c 100644 --- a/abydos/distance/_damerau_levenshtein.py +++ b/abydos/distance/_damerau_levenshtein.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -30,12 +30,15 @@ from sys import maxsize +from deprecation import deprecated + from numpy import int as np_int from numpy import zeros as np_zeros from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = [ 'DamerauLevenshtein', @@ -54,7 +57,31 @@ class DamerauLevenshtein(_Distance): https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java """ - def dist_abs(self, src, tar, cost=(1, 1, 1, 1)): + def __init__(self, cost=(1, 1, 1, 1), normalizer=max, **kwargs): + """Initialize DamerauLevenshtein instance.
+ + Parameters + ---------- + cost : tuple + A 4-tuple representing the cost of the four possible edits: + inserts, deletes, substitutions, and transpositions, respectively + (by default: (1, 1, 1, 1)) + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(DamerauLevenshtein, self).__init__(**kwargs) + self._cost = cost + self._normalizer = normalizer + + def dist_abs(self, src, tar): """Return the Damerau-Levenshtein distance between two strings. Parameters @@ -63,10 +90,6 @@ def dist_abs(self, src, tar, cost=(1, 1, 1, 1)): Source string for comparison tar : str Target string for comparison - cost : tuple - A 4-tuple representing the cost of the four possible edits: - inserts, deletes, substitutions, and transpositions, respectively - (by default: (1, 1, 1, 1)) Returns ------- @@ -91,8 +114,13 @@ def dist_abs(self, src, tar, cost=(1, 1, 1, 1)): >>> cmp.dist_abs('ATCG', 'TAGC') 2 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - ins_cost, del_cost, sub_cost, trans_cost = cost + ins_cost, del_cost, sub_cost, trans_cost = self._cost if src == tar: return 0 @@ -107,9 +135,7 @@ def dist_abs(self, src, tar, cost=(1, 1, 1, 1)): + 'must not be less than the cost of an insert plus a delete.' ) - d_mat = np_zeros((len(src)) * (len(tar)), dtype=np_int).reshape( - (len(src), len(tar)) - ) + d_mat = np_zeros((len(src), len(tar)), dtype=np_int) if src[0] != tar[0]: d_mat[0, 0] = min(sub_cost, ins_cost + del_cost) @@ -173,7 +199,7 @@ def dist_abs(self, src, tar, cost=(1, 1, 1, 1)): return d_mat[len(src) - 1, len(tar) - 1] - def dist(self, src, tar, cost=(1, 1, 1, 1)): + def dist(self, src, tar): """Return the Damerau-Levenshtein similarity of two strings. Damerau-Levenshtein distance normalized to the interval [0, 1]. @@ -191,10 +217,6 @@ def dist(self, src, tar, cost=(1, 1, 1, 1)): Source string for comparison tar : str Target string for comparison - cost : tuple - A 4-tuple representing the cost of the four possible edits: - inserts, deletes, substitutions, and transpositions, respectively - (by default: (1, 1, 1, 1)) Returns ------- @@ -213,15 +235,26 @@ def dist(self, src, tar, cost=(1, 1, 1, 1)): >>> cmp.dist('ATCG', 'TAGC') 0.5 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 - ins_cost, del_cost = cost[:2] - return self.dist_abs(src, tar, cost) / ( - max(len(src) * del_cost, len(tar) * ins_cost) + ins_cost, del_cost = self._cost[:2] + return self.dist_abs(src, tar) / ( + self._normalizer([len(src) * del_cost, len(tar) * ins_cost]) ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the DamerauLevenshtein.dist_abs method instead.', +) def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)): """Return the Damerau-Levenshtein distance between two strings. @@ -254,10 +287,18 @@ def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)): >>> damerau_levenshtein('ATCG', 'TAGC') 2 + .. 
versionadded:: 0.1.0 + """ - return DamerauLevenshtein().dist_abs(src, tar, cost) + return DamerauLevenshtein(cost).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the DamerauLevenshtein.dist method instead.', +) def dist_damerau(src, tar, cost=(1, 1, 1, 1)): """Return the Damerau-Levenshtein similarity of two strings. @@ -290,10 +331,18 @@ def dist_damerau(src, tar, cost=(1, 1, 1, 1)): >>> dist_damerau('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + """ - return DamerauLevenshtein().dist(src, tar, cost) + return DamerauLevenshtein(cost).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the DamerauLevenshtein.sim method instead.', +) def sim_damerau(src, tar, cost=(1, 1, 1, 1)): """Return the Damerau-Levenshtein similarity of two strings. @@ -326,8 +375,10 @@ def sim_damerau(src, tar, cost=(1, 1, 1, 1)): >>> sim_damerau('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + """ - return DamerauLevenshtein().sim(src, tar, cost) + return DamerauLevenshtein(cost).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_dennis.py b/abydos/distance/_dennis.py new file mode 100644 index 000000000..8de00c730 --- /dev/null +++ b/abydos/distance/_dennis.py @@ -0,0 +1,232 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._dennis. + +Dennis similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Dennis'] + + +class Dennis(_TokenDistance): + r"""Dennis similarity. + + For two sets X and Y and a population N, Dennis similarity + :cite:`Dennis:1965` is + + .. math:: + + sim_{Dennis}(X, Y) = + \frac{|X \cap Y| - \frac{|X| \cdot |Y|}{|N|}} + {\sqrt{\frac{|X|\cdot|Y|}{|N|}}} + + This is the fourth of Dennis' association measures, and that which she + claims is the best of the four. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Dennis} = + \frac{a-\frac{(a+b)(a+c)}{n}}{\sqrt{\frac{(a+b)(a+c)}{n}}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Dennis instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Dennis, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Dennis similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dennis similarity + + Examples + -------- + >>> cmp = Dennis() + >>> cmp.sim_score('cat', 'hat') + 13.857142857142858 + >>> cmp.sim_score('Niall', 'Neil') + 10.028539207654113 + >>> cmp.sim_score('aluminum', 'Catalan') + 2.9990827802847835 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.17857142857142858 + + + .. versionadded:: 0.4.0 + + """ + if not src and not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + abacn = ( + self._src_card() + * self._tar_card() + / self._population_unique_card() + ) + + num = a - abacn + if num == 0: + return 0.0 + + return num / abacn ** 0.5 + + def corr(self, src, tar): + """Return the Dennis correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dennis correlation + + Examples + -------- + >>> cmp = Dennis() + >>> cmp.corr('cat', 'hat') + 0.494897959183673 + >>> cmp.corr('Niall', 'Neil') + 0.358162114559075 + >>> cmp.corr('aluminum', 'Catalan') + 0.107041854561785 + >>> cmp.corr('ATCG', 'TAGC') + -0.006377551020408 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score == 0.0: + return 0.0 + return round(score / self._population_unique_card() ** 0.5, 15) + + def sim(self, src, tar): + """Return the normalized Dennis similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Dennis similarity + + Examples + -------- + >>> cmp = Dennis() + >>> cmp.sim('cat', 'hat') + 0.6632653061224487 + >>> cmp.sim('Niall', 'Neil') + 0.5721080763727167 + >>> cmp.sim('aluminum', 'Catalan') + 0.4046945697078567 + >>> cmp.sim('ATCG', 'TAGC') + 0.32908163265306134 + + + .. versionadded:: 0.4.0 + + """ + return (0.5 + self.corr(src, tar)) / 1.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_dice.py b/abydos/distance/_dice.py index 377f2da93..512d2412c 100644 --- a/abydos/distance/_dice.py +++ b/abydos/distance/_dice.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._tversky import Tversky +from .. 
import __version__ __all__ = ['Dice', 'dist_dice', 'sim_dice'] @@ -37,15 +40,79 @@ class Dice(Tversky): r"""Sørensen–Dice coefficient. For two sets X and Y, the Sørensen–Dice coefficient - :cite:`Dice:1945,Sorensen:1948` is - :math:`sim_{dice}(X, Y) = \frac{2 \cdot |X \cap Y|}{|X| + |Y|}`. + :cite:`Dice:1945,Sorensen:1948,Bray:1957,Czekanowski:1909,Motyka:1950` is + + .. math:: + + sim_{Dice}(X, Y) = \frac{2 \cdot |X \cap Y|}{|X| + |Y|} + + This is the complement of Bray & Curtis dissimilarity :cite:`Bray:1957`, + also known as the Lance & Williams dissimilarity :cite:`Lance:1967`. This is identical to the Tanimoto similarity coefficient :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for :math:`\alpha = \beta = 0.5`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Dice} = + \frac{2a}{2a+b+c} + + Notes + ----- + In terms of a confusion matrix, this is equivalent to :math:`F_1` score + :py:meth:`ConfusionTable.f1_score`. + + The multiset variant is termed Gleason similarity :cite:`Gleason:1920`. + + .. versionadded:: 0.3.6 + """ - def sim(self, src, tar, qval=2): + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Dice instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Dice, self).__init__( + alpha=0.5, + beta=0.5, + bias=None, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): """Return the Sørensen–Dice coefficient of two strings. Parameters @@ -54,8 +121,6 @@ def sim(self, src, tar, qval=2): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version Returns ------- @@ -74,10 +139,21 @@ def sim(self, src, tar, qval=2): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return super(self.__class__, self).sim(src, tar, qval, 0.5, 0.5) + return super(Dice, self).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Dice.sim method instead.', +) def sim_dice(src, tar, qval=2): """Return the Sørensen–Dice coefficient of two strings. @@ -90,7 +166,7 @@ def sim_dice(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -108,10 +184,18 @@ def sim_dice(src, tar, qval=2): >>> sim_dice('ATCG', 'TAGC') 0.0 + .. 
versionadded:: 0.1.0 + """ - return Dice().sim(src, tar, qval) + return Dice(qval=qval).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Dice.dist method instead.', +) def dist_dice(src, tar, qval=2): """Return the Sørensen–Dice distance between two strings. @@ -124,7 +208,7 @@ def dist_dice(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -142,8 +226,10 @@ def dist_dice(src, tar, qval=2): >>> dist_dice('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return Dice().dist(src, tar, qval) + return Dice(qval=qval).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_dice_asymmetric_i.py b/abydos/distance/_dice_asymmetric_i.py new file mode 100644 index 000000000..6cc30b4d4 --- /dev/null +++ b/abydos/distance/_dice_asymmetric_i.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._dice_asymmetric_i. + +Dice's Asymmetric I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['DiceAsymmetricI'] + + +class DiceAsymmetricI(_TokenDistance): + r"""Dice's Asymmetric I similarity. + + For two sets X and Y and a population N, Dice's Asymmetric I similarity + :cite:`Dice:1945` is + + .. math:: + + sim_{DiceAsymmetricI}(X, Y) = + \frac{|X \cap Y|}{|X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{DiceAsymmetricI} = + \frac{a}{a+b} + + Notes + ----- + In terms of a confusion matrix, this is equivalent to precision or + positive predictive value :py:meth:`ConfusionTable.precision`. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize DiceAsymmetricI instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(DiceAsymmetricI, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Dice's Asymmetric I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dice's Asymmetric I similarity + + Examples + -------- + >>> cmp = DiceAsymmetricI() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.1111111111111111 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + ab = self._src_card() + + if a == 0.0: + return 0.0 + return a / ab + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_dice_asymmetric_ii.py b/abydos/distance/_dice_asymmetric_ii.py new file mode 100644 index 000000000..47f5c3b62 --- /dev/null +++ b/abydos/distance/_dice_asymmetric_ii.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._dice_asymmetric_ii. + +Dice's Asymmetric II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['DiceAsymmetricII'] + + +class DiceAsymmetricII(_TokenDistance): + r"""Dice's Asymmetric II similarity. + + For two sets X and Y, Dice's Asymmetric II similarity + :cite:`Dice:1945` is + + .. math:: + + sim_{DiceAsymmetricII}(X, Y) = + \frac{|X \cap Y|}{|Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{DiceAsymmetricII} = + \frac{a}{a+c} + + Notes + ----- + In terms of a confusion matrix, this is equivalent to recall, sensitivity, + or true positive rate :py:meth:`ConfusionTable.recall`. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize DiceAsymmetricII instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. 
+ threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(DiceAsymmetricII, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Dice's Asymmetric II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dice's Asymmetric II similarity + + Examples + -------- + >>> cmp = DiceAsymmetricII() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.4 + >>> cmp.sim('aluminum', 'Catalan') + 0.125 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + ac = self._tar_card() + + if a == 0.0: + return 0.0 + return a / ac + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_digby.py b/abydos/distance/_digby.py new file mode 100644 index 000000000..0f37a9403 --- /dev/null +++ b/abydos/distance/_digby.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._digby. + +Digby correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Digby'] + + +class Digby(_TokenDistance): + r"""Digby correlation. + + For two sets X and Y and a population N, Digby's approximation of the + tetrachoric correlation coefficient + :cite:`Digby:1983` is + + .. math:: + + corr_{Digby}(X, Y) = + \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4}- + (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}} + {(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4} + + (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Digby} = + \frac{ad^\frac{3}{4}-bc^\frac{3}{4}}{ad^\frac{3}{4}+bc^\frac{3}{4}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Digby instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Digby, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Digby correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Digby correlation + + Examples + -------- + >>> cmp = Digby() + >>> cmp.corr('cat', 'hat') + 0.9774244829419212 + >>> cmp.corr('Niall', 'Neil') + 0.9491281473458171 + >>> cmp.corr('aluminum', 'Catalan') + 0.7541039303781305 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return -1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (a * d) ** 0.75 - (b * c) ** 0.75 + if num: + return num / ((a * d) ** 0.75 + (b * c) ** 0.75) + return 0.0 + + def sim(self, src, tar): + """Return the Digby similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Digby similarity + + Examples + -------- + >>> cmp = Digby() + >>> cmp.sim('cat', 'hat') + 0.9887122414709606 + >>> cmp.sim('Niall', 'Neil') + 0.9745640736729085 + >>> cmp.sim('aluminum', 'Catalan') + 0.8770519651890653 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_dispersion.py b/abydos/distance/_dispersion.py new file mode 100644 index 000000000..eaf0f97ea --- /dev/null +++ b/abydos/distance/_dispersion.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._dispersion. + +Dispersion correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Dispersion'] + + +class Dispersion(_TokenDistance): + r"""Dispersion correlation. 
+ + For two sets X and Y and a population N, the dispersion + correlation :cite:`IBM:2017` is + + .. math:: + + corr_{dispersion}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {|N|^2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{dispersion} = + \frac{ad-bc}{n^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Dispersion instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Dispersion, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Dispersion correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dispersion correlation + + Examples + -------- + >>> cmp = Dispersion() + >>> cmp.corr('cat', 'hat') + 0.002524989587671803 + >>> cmp.corr('Niall', 'Neil') + 0.002502212619741774 + >>> cmp.corr('aluminum', 'Catalan') + 0.0011570449105440383 + >>> cmp.corr('ATCG', 'TAGC') + -4.06731570179092e-05 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + admbc = a * d - b * c + if admbc == 0.0: + return 0.0 + return admbc / n ** 2 + + def sim(self, src, tar): + """Return the Dispersion similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dispersion similarity + + Examples + -------- + >>> cmp = Dispersion() + >>> cmp.sim('cat', 'hat') + 0.5012624947938359 + >>> cmp.sim('Niall', 'Neil') + 0.5012511063098709 + >>> cmp.sim('aluminum', 'Catalan') + 0.500578522455272 + >>> cmp.sim('ATCG', 'TAGC') + 0.499979663421491 + + + .. versionadded:: 0.4.0 + + """ + return (1 + self.corr(src, tar)) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_distance.py b/abydos/distance/_distance.py index 66816f0d9..b9a48bb88 100644 --- a/abydos/distance/_distance.py +++ b/abydos/distance/_distance.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. 
+# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,11 +28,46 @@ unicode_literals, ) +__all__ = ['_Distance'] + class _Distance(object): - """Abstract Distance class.""" + """Abstract Distance class. + + .. versionadded:: 0.3.6 + """ + + def __init__(self, **kwargs): + """Initialize _Distance instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + - def sim(self, src, tar, *args, **kwargs): + .. versionadded:: 0.4.0 + + """ + self.params = {} + self.set_params(**kwargs) + + def set_params(self, **kwargs): + """Store params in the params dict. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + for key in kwargs: + self.params[key] = kwargs[key] + + def sim(self, src, tar): """Return similarity. Parameters @@ -41,20 +76,19 @@ def sim(self, src, tar, *args, **kwargs): Source string for comparison tar : str Target string for comparison - *args - Variable length argument list. - **kwargs - Arbitrary keyword arguments. Returns ------- float Similarity + + .. versionadded:: 0.3.6 + """ - return 1.0 - self.dist(src, tar, *args, **kwargs) + return 1.0 - self.dist(src, tar) - def dist(self, src, tar, *args, **kwargs): + def dist(self, src, tar): """Return distance. Parameters @@ -63,20 +97,19 @@ def dist(self, src, tar, *args, **kwargs): Source string for comparison tar : str Target string for comparison - *args - Variable length argument list. - **kwargs - Arbitrary keyword arguments. Returns ------- float Distance + + .. versionadded:: 0.3.6 + """ - return 1.0 - self.sim(src, tar, *args, **kwargs) + return 1.0 - self.sim(src, tar) - def dist_abs(self, src, tar, *args, **kwargs): + def dist_abs(self, src, tar): """Return absolute distance. Parameters @@ -85,18 +118,17 @@ def dist_abs(self, src, tar, *args, **kwargs): Source string for comparison tar : str Target string for comparison - *args - Variable length argument list. - **kwargs - Arbitrary keyword arguments. Returns ------- int Absolute distance + + .. versionadded:: 0.3.6 + """ - return self.dist(src, tar, *args, **kwargs) + return self.dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_doolittle.py b/abydos/distance/_doolittle.py new file mode 100644 index 000000000..2253b3a9d --- /dev/null +++ b/abydos/distance/_doolittle.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._doolittle. + +Doolittle similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Doolittle'] + + +class Doolittle(_TokenDistance): + r"""Doolittle similarity. + + For two sets X and Y and a population N, the Doolittle + similarity :cite:`Doolittle:1884` is + + .. 
math:: + + sim_{Doolittle}(X, Y) = + \frac{(|X \cap Y| \cdot |N| - |X| \cdot |Y|)^2} + {|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Doolittle} = + \frac{(an-(a+b)(a+c))^2}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Doolittle instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Doolittle, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Doolittle similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Doolittle similarity + + Examples + -------- + >>> cmp = Doolittle() + >>> cmp.sim('cat', 'hat') + 0.24744247205785666 + >>> cmp.sim('Niall', 'Neil') + 0.13009912077202224 + >>> cmp.sim('aluminum', 'Catalan') + 0.011710186806836291 + >>> cmp.sim('ATCG', 'TAGC') + 4.1196952743799446e-05 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (a * d - b * c) ** 2 + if num == 0.0: + return 0.0 + + return num / ((a + b) * (a + c) * (b + d) * (c + d)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_dunning.py b/abydos/distance/_dunning.py new file mode 100644 index 000000000..ae5228d43 --- /dev/null +++ b/abydos/distance/_dunning.py @@ -0,0 +1,237 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._dunning. 
+ +Dunning similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['Dunning'] + + +class Dunning(_TokenDistance): + r"""Dunning similarity. + + For two sets X and Y and a population N, Dunning log-likelihood + :cite:`Dunning:1993`, following :cite:`Church:1991`, is + + .. math:: + + sim_{Dunning}(X, Y) = \lambda = + |X \cap Y| \cdot log_2(|X \cap Y|) +\\ + |X \setminus Y| \cdot log_2(|X \setminus Y|) + + |Y \setminus X| \cdot log_2(|Y \setminus X|) +\\ + |(N \setminus X) \setminus Y| \cdot + log_2(|(N \setminus X) \setminus Y|) -\\ + (|X| \cdot log_2(|X|) + + |Y| \cdot log_2(|Y|) +\\ + |N \setminus Y| \cdot log_2(|N \setminus Y|) + + |N \setminus X| \cdot log_2(|N \setminus X|)) + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Dunning} = \lambda = + a \cdot log_2(a) +\\ + b \cdot log_2(b) + c \cdot log_2(c) + + d \cdot log_2(d) - \\ + ((a+b) \cdot log_2(a+b) + (a+c) \cdot log_2(a+c) +\\ + (b+d) \cdot log_2(b+d) + (c+d) log_2(c+d)) + + Notes + ----- + To avoid NaNs, every logarithm is calculated as the logarithm of 1 greater + than the value in question. (Python's math.log1p function is used.) + + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Dunning instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Dunning, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Dunning similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Dunning similarity + + Examples + -------- + >>> cmp = Dunning() + >>> cmp.sim('cat', 'hat') + 0.33462839191969423 + >>> cmp.sim('Niall', 'Neil') + 0.19229445539929793 + >>> cmp.sim('aluminum', 'Catalan') + 0.03220862737070572 + >>> cmp.sim('ATCG', 'TAGC') + 0.0010606026735052122 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + # a should not equal n, because 0 will result + # As a workaround, we set d to 1 and add one to n. 
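+        # Dividing a, b, c, & d by n below makes the four cells (and each
+        # pair of marginals) sum to 1, so the sum computed here equals the
+        # full log-likelihood statistic, including its usual +n*log(n) term,
+        # scaled by 1/n; the final division by log(2) converts the natural
+        # logs to the base-2 logs used in the formula above.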
+ if a == n: + d = 1 + n += 1 + + a /= n + b /= n + c /= n + d /= n + + score = 0.0 + for i in [a, b, c, d]: + if i > 0: + score += i * log(i) + for i in [a, d]: + for j in [b, c]: + ij = i + j + if ij > 0: + score -= ij * log(ij) + score *= 2 + score /= log(2) + + return abs(round(score, 15)) + + def sim(self, src, tar): + """Return the normalized Dunning similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Dunning similarity + + Examples + -------- + >>> cmp = Dunning() + >>> cmp.sim('cat', 'hat') + 0.33462839191969423 + >>> cmp.sim('Niall', 'Neil') + 0.19229445539929793 + >>> cmp.sim('aluminum', 'Catalan') + 0.03220862737070572 + >>> cmp.sim('ATCG', 'TAGC') + 0.0010606026735052122 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + score = self.sim_score(src, tar) + if not score: + return 0.0 + + norm = max(self.sim_score(src, src), self.sim_score(tar, tar)) + return score / norm + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_editex.py b/abydos/distance/_editex.py index ad4cf49e3..0cd230816 100644 --- a/abydos/distance/_editex.py +++ b/abydos/distance/_editex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,15 +28,19 @@ unicode_literals, ) +from sys import float_info from unicodedata import normalize as unicode_normalize -from numpy import int as np_int +from deprecation import deprecated + +from numpy import float as np_float from numpy import zeros as np_zeros from six import text_type from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Editex', 'dist_editex', 'editex', 'sim_editex'] @@ -47,6 +51,10 @@ class Editex(_Distance): As described on pages 3 & 4 of :cite:`Zobel:1996`. The local variant is based on :cite:`Ring:2009`. + + .. versionadded:: 0.3.6 + .. versionchanged:: 0.4.0 + Added taper option """ _letter_groups = ( @@ -63,20 +71,49 @@ class Editex(_Distance): _all_letters = frozenset('ABCDEFGIJKLMNOPQRSTUVXYZ') - def dist_abs(self, src, tar, cost=(0, 1, 2), local=False): - """Return the Editex distance between two strings. + def __init__(self, cost=(0, 1, 2), local=False, taper=False, **kwargs): + """Initialize Editex instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison cost : tuple A 3-tuple representing the cost of the four possible edits: match, same-group, and mismatch respectively (by default: (0, 1, 2)) local : bool If True, the local variant of Editex is used + taper : bool + Enables cost tapering. Following :cite:`Zobel:1996`, it causes + edits at the start of the string to "just [exceed] twice the + minimum penalty for replacement or deletion at the end of the + string". + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Editex, self).__init__(**kwargs) + self._cost = cost + self._local = local + self._taper_enabled = taper + + def _taper(self, pos, length): + return ( + round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15) + if self._taper_enabled + else 1 + ) + + def dist_abs(self, src, tar): + """Return the Editex distance between two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -95,8 +132,13 @@ def dist_abs(self, src, tar, cost=(0, 1, 2), local=False): >>> cmp.dist_abs('ATCG', 'TAGC') 6 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - match_cost, group_cost, mismatch_cost = cost + match_cost, group_cost, mismatch_cost = self._cost def r_cost(ch1, ch2): """Return r(a,b) according to Zobel & Dart's definition. @@ -113,6 +155,8 @@ def r_cost(ch1, ch2): int r(a,b) according to Zobel & Dart's definition + .. versionadded:: 0.1.0 + """ if ch1 == ch2: return match_cost @@ -137,6 +181,8 @@ def d_cost(ch1, ch2): int d(a,b) according to Zobel & Dart's definition + .. versionadded:: 0.1.0 + """ if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'): return group_cost @@ -149,36 +195,56 @@ def d_cost(ch1, ch2): src = src.replace('ß', 'SS') tar = tar.replace('ß', 'SS') + src_len = len(src) + tar_len = len(tar) + max_len = max(src_len, tar_len) + if src == tar: return 0.0 if not src: - return len(tar) * mismatch_cost + return sum( + mismatch_cost * self._taper(pos, max_len) + for pos in range(tar_len) + ) if not tar: - return len(src) * mismatch_cost + return sum( + mismatch_cost * self._taper(pos, max_len) + for pos in range(src_len) + ) - d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) - lens = len(src) - lent = len(tar) + d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) src = ' ' + src tar = ' ' + tar - if not local: - for i in range(1, lens + 1): - d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(src[i - 1], src[i]) - for j in range(1, lent + 1): - d_mat[0, j] = d_mat[0, j - 1] + d_cost(tar[j - 1], tar[j]) - - for i in range(1, lens + 1): - for j in range(1, lent + 1): + if not self._local: + for i in range(1, src_len + 1): + d_mat[i, 0] = d_mat[i - 1, 0] + d_cost( + src[i - 1], src[i] + ) * self._taper(i, max_len) + for j in range(1, tar_len + 1): + d_mat[0, j] = d_mat[0, j - 1] + d_cost( + tar[j - 1], tar[j] + ) * self._taper(j, max_len) + + for i in range(1, src_len + 1): + for j in range(1, tar_len + 1): d_mat[i, j] = min( - d_mat[i - 1, j] + d_cost(src[i - 1], src[i]), - d_mat[i, j - 1] + d_cost(tar[j - 1], tar[j]), - d_mat[i - 1, j - 1] + r_cost(src[i], tar[j]), + d_mat[i - 1, j] + + d_cost(src[i - 1], src[i]) + * self._taper(max(i, j), max_len), + d_mat[i, j - 1] + + d_cost(tar[j - 1], tar[j]) + * self._taper(max(i, j), max_len), + d_mat[i - 1, j - 1] + + r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len), ) - return d_mat[lens, lent] + if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]: + return int(d_mat[src_len, tar_len]) + else: + return d_mat[src_len, tar_len] - def dist(self, src, tar, cost=(0, 1, 2), local=False): + def dist(self, src, tar): """Return the normalized Editex distance between two strings. The Editex distance is normalized by dividing the Editex distance @@ -194,11 +260,6 @@ def dist(self, src, tar, cost=(0, 1, 2), local=False): Source string for comparison tar : str Target string for comparison - cost : tuple - A 3-tuple representing the cost of the four possible edits: match, - same-group, and mismatch respectively (by default: (0, 1, 2)) - local : bool - If True, the local variant of Editex is used Returns ------- @@ -217,15 +278,45 @@ def dist(self, src, tar, cost=(0, 1, 2), local=False): >>> cmp.dist('ATCG', 'TAGC') 0.75 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 - mismatch_cost = cost[2] - return self.dist_abs(src, tar, cost, local) / ( - max(len(src) * mismatch_cost, len(tar) * mismatch_cost) - ) - + match_cost, group_cost, mismatch_cost = self._cost + src_len = len(src) + tar_len = len(tar) + + if self._taper_enabled: + normalize_term = max( + [ + sum( + self._taper(pos, src_len) * mismatch_cost + for pos in range(src_len) + ), + sum( + self._taper(pos, tar_len) * mismatch_cost + for pos in range(tar_len) + ), + ] + ) + else: + normalize_term = max( + src_len * mismatch_cost, tar_len * mismatch_cost + ) + return self.dist_abs(src, tar) / normalize_term + + +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Editex.dist_abs method instead.', +) def editex(src, tar, cost=(0, 1, 2), local=False): """Return the Editex distance between two strings. @@ -259,10 +350,18 @@ def editex(src, tar, cost=(0, 1, 2), local=False): >>> editex('ATCG', 'TAGC') 6 + .. versionadded:: 0.1.0 + """ - return Editex().dist_abs(src, tar, cost, local) + return Editex(cost, local).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Editex.dist method instead.', +) def dist_editex(src, tar, cost=(0, 1, 2), local=False): """Return the normalized Editex distance between two strings. @@ -296,10 +395,18 @@ def dist_editex(src, tar, cost=(0, 1, 2), local=False): >>> dist_editex('ATCG', 'TAGC') 0.75 + .. versionadded:: 0.1.0 + """ - return Editex().dist(src, tar, cost, local) + return Editex(cost, local).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Editex.sim method instead.', +) def sim_editex(src, tar, cost=(0, 1, 2), local=False): """Return the normalized Editex similarity of two strings. @@ -333,8 +440,10 @@ def sim_editex(src, tar, cost=(0, 1, 2), local=False): >>> sim_editex('ATCG', 'TAGC') 0.25 + .. versionadded:: 0.1.0 + """ - return Editex().sim(src, tar, cost, local) + return Editex(cost, local).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_euclidean.py b/abydos/distance/_euclidean.py index e9bdd84dc..5483aa2b9 100644 --- a/abydos/distance/_euclidean.py +++ b/abydos/distance/_euclidean.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._minkowski import Minkowski +from .. import __version__ __all__ = ['Euclidean', 'dist_euclidean', 'euclidean', 'sim_euclidean'] @@ -38,9 +41,54 @@ class Euclidean(Minkowski): Euclidean distance is the straigh-line or as-the-crow-flies distance, equivalent to Minkowski distance in :math:`L^2`-space. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): + def __init__( + self, alphabet=0, tokenizer=None, intersection_type='crisp', **kwargs + ): + """Initialize Euclidean instance. 
+ + Parameters + ---------- + alphabet : collection or int + The values or size of the alphabet + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Euclidean, self).__init__( + pval=2, + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar, normalized=False): """Return the Euclidean distance between two strings. Parameters @@ -49,12 +97,9 @@ def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version normalized : bool Normalizes to [0, 1] if True - alphabet : collection or int - The values or size of the alphabet + Returns ------- @@ -73,12 +118,15 @@ def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): >>> round(cmp.dist_abs('ATCG', 'TAGC'), 12) 3.162277660168 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return super(self.__class__, self).dist_abs( - src, tar, qval, 2, normalized, alphabet - ) + return super(Euclidean, self).dist_abs(src, tar, normalized=normalized) - def dist(self, src, tar, qval=2, alphabet=None): + def dist(self, src, tar): """Return the normalized Euclidean distance between two strings. The normalized Euclidean distance is a distance @@ -90,10 +138,6 @@ def dist(self, src, tar, qval=2, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -112,11 +156,22 @@ def dist(self, src, tar, qval=2, alphabet=None): >>> cmp.dist('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return self.dist_abs(src, tar, qval, True, alphabet) + return self.dist_abs(src, tar, normalized=True) -def euclidean(src, tar, qval=2, normalized=False, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Euclidean.dist_abs method instead.', +) +def euclidean(src, tar, qval=2, normalized=False, alphabet=0): """Return the Euclidean distance between two strings. This is a wrapper for :py:meth:`Euclidean.dist_abs`. 
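The deprecated wrappers below illustrate the migration pattern used throughout this changeset: per-call options become constructor arguments, and each module-level function now just builds a configured instance and delegates to it. A minimal sketch of the caller-side change, assuming ``Euclidean`` and ``dist_euclidean`` are re-exported by the :py:mod:`abydos.distance` package as their ``__all__`` entries suggest::

    from abydos.distance import Euclidean, dist_euclidean

    # Class-based API: tokenization options move to the constructor.
    cmp = Euclidean(qval=2)
    new_val = cmp.dist('Niall', 'Neil')

    # Deprecated wrapper (slated for removal in 0.6.0); it instantiates
    # Euclidean internally, so the result is identical.
    old_val = dist_euclidean('Niall', 'Neil', qval=2)
    assert new_val == old_val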
@@ -128,7 +183,7 @@ def euclidean(src, tar, qval=2, normalized=False, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram normalized : bool Normalizes to [0, 1] if True alphabet : collection or int @@ -149,11 +204,21 @@ def euclidean(src, tar, qval=2, normalized=False, alphabet=None): >>> round(euclidean('ATCG', 'TAGC'), 12) 3.162277660168 + .. versionadded:: 0.3.0 + """ - return Euclidean().dist_abs(src, tar, qval, normalized, alphabet) + return Euclidean(alphabet=alphabet, qval=qval).dist_abs( + src, tar, normalized=normalized + ) -def dist_euclidean(src, tar, qval=2, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Euclidean.dist method instead.', +) +def dist_euclidean(src, tar, qval=2, alphabet=0): """Return the normalized Euclidean distance between two strings. This is a wrapper for :py:meth:`Euclidean.dist`. @@ -165,7 +230,7 @@ def dist_euclidean(src, tar, qval=2, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alphabet : collection or int The values or size of the alphabet @@ -185,11 +250,19 @@ def dist_euclidean(src, tar, qval=2, alphabet=None): >>> dist_euclidean('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.3.0 + """ - return Euclidean().dist(src, tar, qval, alphabet) + return Euclidean(alphabet=alphabet, qval=qval).dist(src, tar) -def sim_euclidean(src, tar, qval=2, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Euclidean.sim method instead.', +) +def sim_euclidean(src, tar, qval=2, alphabet=0): """Return the normalized Euclidean similarity of two strings. This is a wrapper for :py:meth:`Euclidean.sim`. @@ -201,7 +274,7 @@ def sim_euclidean(src, tar, qval=2, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alphabet : collection or int The values or size of the alphabet @@ -221,8 +294,10 @@ def sim_euclidean(src, tar, qval=2, alphabet=None): >>> sim_euclidean('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.3.0 + """ - return Euclidean().sim(src, tar, qval, alphabet) + return Euclidean(alphabet=alphabet, qval=qval).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_eudex.py b/abydos/distance/_eudex.py index 231a58844..b5b6a6f7d 100644 --- a/abydos/distance/_eudex.py +++ b/abydos/distance/_eudex.py @@ -30,9 +30,12 @@ from types import GeneratorType +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ from ..phonetic import eudex __all__ = ['Eudex', 'dist_eudex', 'eudex_hamming', 'sim_eudex'] @@ -42,6 +45,8 @@ class Eudex(_Distance): """Distance between the Eudex hashes of two terms. Cf. :cite:`Ticki:2016`. + + .. versionadded:: 0.3.6 """ @staticmethod @@ -56,6 +61,11 @@ def gen_fibonacci(): int The next Fibonacci number + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ num_a, num_b = 1, 2 while True: @@ -78,23 +88,22 @@ def gen_exponential(base=2): int The next power of `base` + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ exp = 0 while True: yield base ** exp exp += 1 - def dist_abs( - self, src, tar, weights='exponential', max_length=8, normalized=False - ): - """Calculate the distance between the Eudex hashes of two terms. + def __init__(self, weights='exponential', max_length=8, **kwargs): + """Initialize Eudex instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison weights : str, iterable, or generator function The weights or weights generator function @@ -110,8 +119,32 @@ def dist_abs( - If set to an iterable, the iterable's values should be integers and will be used as the weights. + In all cases, the weights should be ordered or generated from least + significant to most significant, so larger values should generally + come first. + max_length : int The number of characters to encode as a eudex hash + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Eudex, self).__init__(**kwargs) + self._weights = weights + self._max_length = max_length + + def dist_abs(self, src, tar, normalized=False): + """Calculate the distance between the Eudex hashes of two terms. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison normalized : bool Normalizes to [0, 1] if True @@ -132,43 +165,51 @@ def dist_abs( >>> cmp.dist_abs('ATCG', 'TAGC') 403 - >>> cmp.dist_abs('cat', 'hat', weights='fibonacci') + >>> cmp = Eudex(weights='fibonacci') + >>> cmp.dist_abs('cat', 'hat') 34 - >>> cmp.dist_abs('Niall', 'Neil', weights='fibonacci') + >>> cmp.dist_abs('Niall', 'Neil') 2 - >>> cmp.dist_abs('Colin', 'Cuilen', weights='fibonacci') + >>> cmp.dist_abs('Colin', 'Cuilen') 7 - >>> cmp.dist_abs('ATCG', 'TAGC', weights='fibonacci') + >>> cmp.dist_abs('ATCG', 'TAGC') 117 - >>> cmp.dist_abs('cat', 'hat', weights=None) + >>> cmp = Eudex(weights=None) + >>> cmp.dist_abs('cat', 'hat') 1 - >>> cmp.dist_abs('Niall', 'Neil', weights=None) + >>> cmp.dist_abs('Niall', 'Neil') 1 - >>> cmp.dist_abs('Colin', 'Cuilen', weights=None) + >>> cmp.dist_abs('Colin', 'Cuilen') 2 - >>> cmp.dist_abs('ATCG', 'TAGC', weights=None) + >>> cmp.dist_abs('ATCG', 'TAGC') 9 >>> # Using the OEIS A000142: - >>> cmp.dist_abs('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040]) + >>> cmp = Eudex(weights=[1, 1, 2, 6, 24, 120, 720, 5040]) + >>> cmp.dist_abs('cat', 'hat') + 5040 + >>> cmp.dist_abs('Niall', 'Neil') 1 - >>> cmp.dist_abs('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040]) - 720 - >>> cmp.dist_abs('Colin', 'Cuilen', - ... [1, 1, 2, 6, 24, 120, 720, 5040]) - 744 - >>> cmp.dist_abs('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040]) - 6243 + >>> cmp.dist_abs('Colin', 'Cuilen') + 7 + >>> cmp.dist_abs('ATCG', 'TAGC') + 15130 + + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class """ + # Calculate the eudex hashes and XOR them - xored = eudex(src, max_length=max_length) ^ eudex( - tar, max_length=max_length + xored = eudex(src, max_length=self._max_length) ^ eudex( + tar, max_length=self._max_length ) # Simple hamming distance (all bits are equal) - if not weights: + if not self._weights: binary = bin(xored) distance = binary.count('1') if normalized: @@ -177,14 +218,21 @@ def dist_abs( # If weights is a function, it should create a generator, # which we now use to populate a list - if callable(weights): - weights = weights() - elif weights == 'exponential': + if callable(self._weights): + weights = self._weights() + elif self._weights == 'exponential': weights = Eudex.gen_exponential() - elif weights == 'fibonacci': + elif self._weights == 'fibonacci': weights = Eudex.gen_fibonacci() + elif hasattr(self._weights, '__iter__') and not isinstance( + self._weights, str + ): + weights = self._weights[::-1] + else: + raise ValueError('Unrecognized weights value or type.') + if isinstance(weights, GeneratorType): - weights = [next(weights) for _ in range(max_length)][::-1] + weights = [next(weights) for _ in range(self._max_length)][::-1] # Sum the weighted hamming distance distance = 0 @@ -199,7 +247,7 @@ def dist_abs( return distance - def dist(self, src, tar, weights='exponential', max_length=8): + def dist(self, src, tar): """Return normalized distance between the Eudex hashes of two terms. This is Eudex distance normalized to [0, 1]. @@ -210,10 +258,6 @@ def dist(self, src, tar, weights='exponential', max_length=8): Source string for comparison tar : str Target string for comparison - weights : str, iterable, or generator function - The weights or weights generator function - max_length : int - The number of characters to encode as a eudex hash Returns ------- @@ -232,10 +276,21 @@ def dist(self, src, tar, weights='exponential', max_length=8): >>> round(cmp.dist('ATCG', 'TAGC'), 12) 0.197549019608 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return self.dist_abs(src, tar, weights, max_length, True) + return self.dist_abs(src, tar, True) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Eudex.dist_abs method instead.', +) def eudex_hamming( src, tar, weights='exponential', max_length=8, normalized=False ): @@ -292,18 +347,26 @@ def eudex_hamming( >>> # Using the OEIS A000142: >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040]) - 1 + 5040 >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040]) - 720 + 1 >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040]) - 744 + 7 >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040]) - 6243 + 15130 + + .. versionadded:: 0.3.0 """ - return Eudex().dist_abs(src, tar, weights, max_length, normalized) + return Eudex(weights, max_length).dist_abs(src, tar, normalized) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Eudex.dist method instead.', +) def dist_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming distance between Eudex hashes of two terms. @@ -336,10 +399,18 @@ def dist_eudex(src, tar, weights='exponential', max_length=8): >>> round(dist_eudex('ATCG', 'TAGC'), 12) 0.197549019608 + .. 
versionadded:: 0.3.0 + """ - return Eudex().dist(src, tar, weights, max_length) + return Eudex(weights, max_length).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Eudex.sim method instead.', +) def sim_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming similarity between Eudex hashes of two terms. @@ -372,8 +443,10 @@ def sim_eudex(src, tar, weights='exponential', max_length=8): >>> round(sim_eudex('ATCG', 'TAGC'), 12) 0.802450980392 + .. versionadded:: 0.3.0 + """ - return Eudex().sim(src, tar, weights, max_length) + return Eudex(weights, max_length).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_eyraud.py b/abydos/distance/_eyraud.py new file mode 100644 index 000000000..153a42caa --- /dev/null +++ b/abydos/distance/_eyraud.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._eyraud. + +Eyraud similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Eyraud'] + + +class Eyraud(_TokenDistance): + r"""Eyraud similarity. + + For two sets X and Y and a population N, the Eyraud + similarity :cite:`Eyraud:1938` is + + .. math:: + + sim_{Eyraud}(X, Y) = + \frac{|X \cap Y| - |X| \cdot |Y|} + {|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|} + + For lack of access to the original, this formula is based on the concurring + formulae presented in :cite:`Shi:1993` and :cite:`Hubalek:1982`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Eyraud} = + \frac{a-(a+b)(a+c)}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Eyraud instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(Eyraud, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Eyraud similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Eyraud similarity + + Examples + -------- + >>> cmp = Eyraud() + >>> cmp.sim_score('cat', 'hat') + -1.438198553583169e-06 + >>> cmp.sim_score('Niall', 'Neil') + -1.5399964580081465e-06 + >>> cmp.sim_score('aluminum', 'Catalan') + -1.6354719962967386e-06 + >>> cmp.sim_score('ATCG', 'TAGC') + -1.6478781097519779e-06 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + denom = max(1, a + b) * max(1, c + d) * max(1, a + c) * max(1, b + d) + num = a - (a + b) * (a + c) + + return num / denom + + def sim(self, src, tar): + """Return the normalized Eyraud similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Eyraud similarity + + Examples + -------- + >>> cmp = Eyraud() + >>> cmp.sim('cat', 'hat') + 1.438198553583169e-06 + >>> cmp.sim('Niall', 'Neil') + 1.5399964580081465e-06 + >>> cmp.sim('aluminum', 'Catalan') + 1.6354719962967386e-06 + >>> cmp.sim('ATCG', 'TAGC') + 1.6478781097519779e-06 + + + .. versionadded:: 0.4.0 + + """ + return 0.0 - self.sim_score(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fager_mcgowan.py b/abydos/distance/_fager_mcgowan.py new file mode 100644 index 000000000..90d23625a --- /dev/null +++ b/abydos/distance/_fager_mcgowan.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fager_mcgowan. + +Fager & McGowan similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['FagerMcGowan'] + + +class FagerMcGowan(_TokenDistance): + r"""Fager & McGowan similarity. + + For two sets X and Y, the Fager & McGowan similarity + :cite:`Fager:1957,Fager:1963` is + + .. math:: + + sim_{FagerMcGowan}(X, Y) = + \frac{|X \cap Y|}{\sqrt{|X|\cdot|Y|}} - + \frac{1}{2\sqrt{max(|X|, |Y|)}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{FagerMcGowan} = + \frac{a}{\sqrt{(a+b)(a+c)}} - \frac{1}{2\sqrt{max(a+b, a+c)}} + + .. 
versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize FagerMcGowan instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(FagerMcGowan, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim_score(self, src, tar): + """Return the Fager & McGowan similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fager & McGowan similarity + + Examples + -------- + >>> cmp = FagerMcGowan() + >>> cmp.sim_score('cat', 'hat') + 0.25 + >>> cmp.sim_score('Niall', 'Neil') + 0.16102422643817918 + >>> cmp.sim_score('aluminum', 'Catalan') + -0.048815536468908724 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.22360679774997896 + + + .. versionadded:: 0.4.0 + + """ + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + + first = a / (apb * apc) ** 0.5 if a else 0.0 + second = 1 / (2 * (max(apb, apc) ** 0.5)) + + return first - second + + def sim(self, src, tar): + r"""Return the normalized Fager & McGowan similarity of two strings. + + As this similarity ranges from :math:`(-\inf, 1.0)`, this normalization + simply clamps the value to the range (0.0, 1.0). + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Fager & McGowan similarity + + Examples + -------- + >>> cmp = FagerMcGowan() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.16102422643817918 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return max(0.0, self.sim_score(src, tar)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_faith.py b/abydos/distance/_faith.py new file mode 100644 index 000000000..acf572f61 --- /dev/null +++ b/abydos/distance/_faith.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
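To make the Fager & McGowan formula above concrete: with the default bigram tokenization that its doctest values imply, 'cat' and 'hat' each yield the four q-grams $c, ca, at, t# and $h, ha, at, t#, two of which are shared, so sim_score('cat', 'hat') = 2/sqrt(4*4) - 1/(2*sqrt(max(4, 4))) = 0.5 - 0.25 = 0.25, matching the doctest above.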
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._faith. + +Faith similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Faith'] + + +class Faith(_TokenDistance): + r"""Faith similarity. + + For two sets X and Y and a population N, the Faith + similarity :cite:`Faith:1983` is + + .. math:: + + sim_{Faith}(X, Y) = \frac{|X \cap Y| + + \frac{|(N \setminus X) \setminus Y|}{2}}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Faith} = + \frac{a+\frac{d}{2}}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Faith instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Faith, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Faith similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Faith similarity + + Examples + -------- + >>> cmp = Faith() + >>> cmp.sim('cat', 'hat') + 0.4987244897959184 + >>> cmp.sim('Niall', 'Neil') + 0.4968112244897959 + >>> cmp.sim('aluminum', 'Catalan') + 0.4910828025477707 + >>> cmp.sim('ATCG', 'TAGC') + 0.49362244897959184 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + num = self._intersection_card() + self._total_complement_card() / 2 + if num == 0.0: + return 0.0 + + return num / self._population_unique_card() + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fellegi_sunter.py b/abydos/distance/_fellegi_sunter.py new file mode 100644 index 000000000..867f32fc2 --- /dev/null +++ b/abydos/distance/_fellegi_sunter.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fellegi_sunter. + +Fellegi-Sunter similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import exp, log +from sys import float_info + +from ._token_distance import _TokenDistance + +__all__ = ['FellegiSunter'] + + +class FellegiSunter(_TokenDistance): + r"""Fellegi-Sunter similarity. + + Fellegi-Sunter similarity is based on the description in + :cite:`Cohen:2003` and implementation in :cite:`Cohen:2003b`. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + tokenizer=None, + intersection_type='crisp', + simplified=False, + mismatch_factor=0.5, + **kwargs + ): + """Initialize FellegiSunter instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + simplified : bool + Specifies to use the simplified scoring variant + mismatch_factor : float + Specifies the penalty factor for mismatches + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(FellegiSunter, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + self._simplified = simplified + self._mismatch_factor = mismatch_factor + + def sim_score(self, src, tar): + """Return the Fellegi-Sunter similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fellegi-Sunter similarity + + Examples + -------- + >>> cmp = FellegiSunter() + >>> cmp.sim_score('cat', 'hat') + 0.8803433378011485 + >>> cmp.sim_score('Niall', 'Neil') + 0.6958768466635681 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.45410905865149187 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + src_tokens, tar_tokens = self._get_tokens() + + src_total = sum(src_tokens.values()) + tar_total = sum(tar_tokens.values()) + src_unique = len(src_tokens) + tar_unique = len(tar_tokens) + + similarity = 0.0 + for _tok, count in self._intersection().items(): + if self._simplified: + similarity += -log(count / tar_total) + else: + prob = count / tar_total + similarity -= log( + 1 + + float_info.epsilon + - exp( + src_unique + * tar_unique + * log(1 + float_info.epsilon - prob * prob) + ) + ) + + for _tok, count in self._src_only().items(): + if self._simplified: + similarity -= -log(count / src_total) * self._mismatch_factor + + return similarity + + def sim(self, src, tar): + """Return the normalized Fellegi-Sunter similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Fellegi-Sunter similarity + + Examples + -------- + >>> cmp = FellegiSunter() + >>> cmp.sim('cat', 'hat') + 0.2934477792670495 + >>> cmp.sim('Niall', 'Neil') + 0.13917536933271363 + >>> cmp.sim('aluminum', 'Catalan') + 0.056763632331436484 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score == 0.0: + return 0.0 + if self._simplified: + return max(0.0, score / (len(src) + len(tar))) + return max(0.0, score / max(len(src), len(tar))) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fidelity.py b/abydos/distance/_fidelity.py new file mode 100644 index 000000000..da7ede2ab --- /dev/null +++ b/abydos/distance/_fidelity.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fidelity. + +Fidelity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Fidelity'] + + +class Fidelity(_TokenDistance): + r"""Fidelity. + + For two multisets X and Y drawn from an alphabet S, fidelity is + + .. math:: + + sim_{Fidelity}(X, Y) = + \Bigg( \sum_{i \in S} \sqrt{|\frac{A_i}{|A|} \cdot + \frac{B_i}{|B|}|} \Bigg)^2 + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Fidelity instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. 
versionadded:: 0.4.0 + + """ + super(Fidelity, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the fidelity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + fidelity + + Examples + -------- + >>> cmp = Fidelity() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.1333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.013888888888888888 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + src_mag = max(1, sum(self._src_tokens.values())) + tar_mag = max(1, sum(self._tar_tokens.values())) + + return ( + sum( + ( + abs( + self._src_tokens[tok] + / src_mag + * self._tar_tokens[tok] + / tar_mag + ) + ) + ** 0.5 + for tok in alphabet + ) + ** 2 + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fleiss.py b/abydos/distance/_fleiss.py new file mode 100644 index 000000000..948897225 --- /dev/null +++ b/abydos/distance/_fleiss.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fleiss. + +Fleiss correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Fleiss'] + + +class Fleiss(_TokenDistance): + r"""Fleiss correlation. + + For two sets X and Y and a population N, Fleiss correlation + :cite:`Fleiss:1975` is + + .. math:: + + corr_{Fleiss}(X, Y) = + \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|) \cdot + (|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|)} + {2 \cdot |X| \cdot |N \setminus X| \cdot |Y| \cdot |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Fleiss} = + \frac{(ad-bc)((a+b)(c+d)+(a+c)(b+d))}{2(a+b)(c+d)(a+c)(b+d)} + + This is Fleiss' :math:`M(A_1)`, :math:`ad-bc` divided by the harmonic mean + of the marginals :math:`p_1q_1 = (a+b)(c+d)` and + :math:`p_2q_2 = (a+c)(b+d)`. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Fleiss instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Fleiss, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Fleiss correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fleiss correlation + + Examples + -------- + >>> cmp = Fleiss() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.3621712520061204 + >>> cmp.corr('aluminum', 'Catalan') + 0.10839724112919989 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (a * d - b * c) * ((a + b) * (c + d) + (a + c) * (b + d)) + + if num == 0.0: + return 0.0 + return num / (2.0 * (a + b) * (c + d) * (a + c) * (b + d)) + + def sim(self, src, tar): + """Return the Fleiss similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fleiss similarity + + Examples + -------- + >>> cmp = Fleiss() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6810856260030602 + >>> cmp.sim('aluminum', 'Catalan') + 0.5541986205645999 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fleiss_levin_paik.py b/abydos/distance/_fleiss_levin_paik.py new file mode 100644 index 000000000..e603b9c54 --- /dev/null +++ b/abydos/distance/_fleiss_levin_paik.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fleiss_levin_paik. + +Fleiss-Levin-Paik similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['FleissLevinPaik'] + + +class FleissLevinPaik(_TokenDistance): + r"""Fleiss-Levin-Paik similarity. 
+ + For two sets X and Y and a population N, Fleiss-Levin-Paik similarity + :cite:`Fleiss:2003` is + + .. math:: + + sim_{FleissLevinPaik}(X, Y) = + \frac{2|(N \setminus X) \setminus Y|} + {2|(N \setminus X) \setminus Y| + + |X \setminus Y| + |Y \setminus X|} + + This is :cite:`Morris:2012`'s 'd Specific Agreement'. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{FleissLevinPaik} = + \frac{2d}{2d + b + c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize FleissLevinPaik instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(FleissLevinPaik, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Fleiss-Levin-Paik similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fleiss-Levin-Paik similarity + + Examples + -------- + >>> cmp = FleissLevinPaik() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9955041746949261 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903412749517064 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if d == 0.0: + return 0.0 + return 2 * d / (2 * d + b + c) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_flexmetric.py b/abydos/distance/_flexmetric.py new file mode 100644 index 000000000..635aac708 --- /dev/null +++ b/abydos/distance/_flexmetric.py @@ -0,0 +1,243 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
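Most of the token-based measures above reduce to the same 2x2 table, and their doctest values can be reproduced by hand. A small illustrative sketch follows, assuming the default bigram tokenizer with '$'/'#' padding and a population size of 784; both are inferred from the doctest values rather than shown in this diff::

    from __future__ import division

    def bigrams(word):
        """Return the padded bigrams of a word ('$' start, '#' stop)."""
        padded = '$' + word + '#'
        return {padded[i:i + 2] for i in range(len(padded) - 1)}

    src, tar = bigrams('cat'), bigrams('hat')
    n = 784                        # assumed default population size
    a = len(src & tar)             # 2: bigrams shared ('at', 't#')
    b = len(src - tar)             # 2: bigrams only in 'cat'
    c = len(tar - src)             # 2: bigrams only in 'hat'
    d = n - (a + b + c)            # 778: bigrams in neither word

    # Fleiss correlation, (ad-bc)((a+b)(c+d)+(a+c)(b+d)) / (2(a+b)(c+d)(a+c)(b+d)):
    num = (a * d - b * c) * ((a + b) * (c + d) + (a + c) * (b + d))
    print(num / (2 * (a + b) * (c + d) * (a + c) * (b + d)))
    # 0.49743589743589745, as in the Fleiss doctest above

    # Fleiss-Levin-Paik, 2d/(2d+b+c):
    print(2 * d / (2 * d + b + c))
    # 0.9974358974358974, as in the Fleiss-Levin-Paik doctest above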
+ +"""abydos.distance._flexmetric. + +FlexMetric distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import float as np_float +from numpy import zeros as np_zeros + +from ._distance import _Distance + +__all__ = ['FlexMetric'] + + +class FlexMetric(_Distance): + r"""FlexMetric distance. + + FlexMetric distance :cite:`Kempken:2005` + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, normalizer=max, indel_costs=None, subst_costs=None, **kwargs + ): + """Initialize FlexMetric instance. + + Parameters + ---------- + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + indel_costs : list of tuples + A list of insertion and deletion costs. Each list element should + be a tuple consisting of an iterable (sets are best) and a float + value. The iterable consists of those letters whose insertion + or deletion has a cost equal to the float value. + subst_costs : list of tuples + A list of substitution costs. Each list element should + be a tuple consisting of an iterable (sets are best) and a float + value. The iterable consists of the letters in each letter class, + which may be substituted for each other at cost equal to the float + value. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(FlexMetric, self).__init__(**kwargs) + self._normalizer = normalizer + + if indel_costs is None: + self._indel_costs = [ + (frozenset('dtch'), 0.4), + (frozenset('e'), 0.5), + (frozenset('u'), 0.9), + (frozenset('rpn'), 0.95), + ] + else: + self._indel_costs = indel_costs + + def _get_second(s): + return s[1] + + if subst_costs is None: + self._subst_costs = [ + (frozenset('szß'), 0.1), + (frozenset('dt'), 0.1), + (frozenset('iy'), 0.1), + (frozenset('ckq'), 0.1), + (frozenset('eä'), 0.1), + (frozenset('uüv'), 0.1), + (frozenset('iü'), 0.1), + (frozenset('fv'), 0.1), + (frozenset('zc'), 0.1), + (frozenset('ij'), 0.1), + (frozenset('bp'), 0.1), + (frozenset('eoö'), 0.2), + (frozenset('aä'), 0.2), + (frozenset('mbp'), 0.4), + (frozenset('uw'), 0.4), + (frozenset('uo'), 0.8), + (frozenset('aeiouy'), 0.9), + ] + else: + self._subst_costs = sorted(subst_costs, key=_get_second) + + def _cost(self, src, s_pos, tar, t_pos): + if s_pos == -1: + if t_pos > 0 and tar[t_pos - 1] == tar[t_pos]: + return 0.0 + for letter_set in self._indel_costs: + if tar[t_pos] in letter_set[0]: + return letter_set[1] + else: + return 1.0 + elif t_pos == -1: + if s_pos > 0 and src[s_pos - 1] == src[s_pos]: + return 0.0 + for letter_set in self._indel_costs: + if src[s_pos] in letter_set[0]: + return letter_set[1] + else: + return 1.0 + for letter_set in self._subst_costs: + if src[s_pos] in letter_set[0] and tar[t_pos] in letter_set[0]: + return letter_set[1] + else: + return 1.0 + + def dist_abs(self, src, tar): + """Return the FlexMetric distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + FlexMetric distance + + Examples + -------- + >>> cmp = FlexMetric() + >>> cmp.dist_abs('cat', 'hat') + 0.8 + >>> cmp.dist_abs('Niall', 'Neil') + 1.5 + >>> cmp.dist_abs('aluminum', 'Catalan') + 6.7 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2.1999999999999997 + + + .. 
versionadded:: 0.4.0 + + """ + src_len = len(src) + tar_len = len(tar) + + if src == tar: + return 0 + if not src: + return sum(self._cost('', -1, tar, j) for j in range(len(tar))) + if not tar: + return sum(self._cost(src, i, '', -1) for i in range(len(src))) + + d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) + for i in range(1, src_len + 1): + d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1) + for j in range(1, tar_len + 1): + d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1) + + src_lc = src.lower() + tar_lc = tar.lower() + + for i in range(src_len): + for j in range(tar_len): + d_mat[i + 1, j + 1] = min( + d_mat[i + 1, j] + self._cost('', -1, tar_lc, j), # ins + d_mat[i, j + 1] + self._cost(src_lc, i, '', -1), # del + d_mat[i, j] + + ( + self._cost(src_lc, i, tar_lc, j) + if src[i] != tar[j] + else 0 + ), # sub/== + ) + + return d_mat[src_len, tar_len] + + def dist(self, src, tar): + """Return the normalized FlexMetric distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Normalized FlexMetric distance + + Examples + -------- + >>> cmp = FlexMetric() + >>> cmp.dist('cat', 'hat') + 0.26666666666666666 + >>> cmp.dist('Niall', 'Neil') + 0.3 + >>> cmp.dist('aluminum', 'Catalan') + 0.8375 + >>> cmp.dist('ATCG', 'TAGC') + 0.5499999999999999 + + + .. versionadded:: 0.4.0 + + """ + score = self.dist_abs(src, tar) + if score: + return score / self._normalizer([len(src), len(tar)]) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_forbes_i.py b/abydos/distance/_forbes_i.py new file mode 100644 index 000000000..50a9c7101 --- /dev/null +++ b/abydos/distance/_forbes_i.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._forbes_i. + +Forbes I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['ForbesI'] + + +class ForbesI(_TokenDistance): + r"""Forbes I similarity. + + For two sets X and Y and a population N, the Forbes I + similarity :cite:`Forbes:1907,Mozley:1936` is + + .. math:: + + sim_{ForbesI}(X, Y) = + \frac{|N| \cdot |X \cap Y|}{|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{ForbesI} = + \frac{na}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ForbesI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ForbesI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Forbes I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Forbes I similarity + + Examples + -------- + >>> cmp = ForbesI() + >>> cmp.sim_score('cat', 'hat') + 98.0 + >>> cmp.sim_score('Niall', 'Neil') + 52.266666666666666 + >>> cmp.sim_score('aluminum', 'Catalan') + 10.902777777777779 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + n = self._population_unique_card() + apb = self._src_card() + apc = self._tar_card() + + num = n * a + if num: + return num / (apb * apc) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Forbes I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Forbes I similarity + + Examples + -------- + >>> cmp = ForbesI() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.11125283446712018 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + return self.sim_score(src, tar) / max( + self.sim_score(src, src), self.sim_score(tar, tar) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_forbes_ii.py b/abydos/distance/_forbes_ii.py new file mode 100644 index 000000000..e5bd3b513 --- /dev/null +++ b/abydos/distance/_forbes_ii.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._forbes_ii. 
+ +Forbes II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['ForbesII'] + + +class ForbesII(_TokenDistance): + r"""Forbes II correlation. + + For two sets X and Y and a population N, the Forbes II correlation, + as described in :cite:`Forbes:1925`, is + + .. math:: + + corr_{ForbesII}(X, Y) = + \frac{|X \setminus Y| \cdot |Y \setminus X| - + |X \cap Y| \cdot |(N \setminus X) \setminus Y|} + {|X| \cdot |Y| - |N| \cdot min(|X|, |Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{ForbesII} = + \frac{bc-ad}{(a+b)(a+c) - n \cdot min(a+b, a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ForbesII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ForbesII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Forbes II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Forbes II correlation + + Examples + -------- + >>> cmp = ForbesII() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.3953727506426735 + >>> cmp.corr('aluminum', 'Catalan') + 0.11485180412371133 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + n = self._population_unique_card() + + num = n * a - apb * apc + if num: + return num / (n * min(apb, apc) - apb * apc) + return 0.0 + + def sim(self, src, tar): + """Return the Forbes II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Forbes II similarity + + Examples + -------- + >>> cmp = ForbesII() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6976863753213367 + >>> cmp.sim('aluminum', 'Catalan') + 0.5574259020618557 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fossum.py b/abydos/distance/_fossum.py new file mode 100644 index 000000000..99b27c345 --- /dev/null +++ b/abydos/distance/_fossum.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fossum. + +Fossum similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Fossum'] + + +class Fossum(_TokenDistance): + r"""Fossum similarity. + + For two sets X and Y and a population N, the Fossum similarity + :cite:`Fossum:1966` is + + .. math:: + + sim_{Fossum}(X, Y) = + \frac{|N| \cdot \Big(|X \cap Y|-\frac{1}{2}\Big)^2}{|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Fossum} = + \frac{n(a-\frac{1}{2})^2}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Fossum instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Fossum, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Fossum similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fossum similarity + + Examples + -------- + >>> cmp = Fossum() + >>> cmp.sim_score('cat', 'hat') + 110.25 + >>> cmp.sim_score('Niall', 'Neil') + 58.8 + >>> cmp.sim_score('aluminum', 'Catalan') + 2.7256944444444446 + >>> cmp.sim_score('ATCG', 'TAGC') + 7.84 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + n = self._population_unique_card() + a = self._intersection_card() + apb = max(1.0, self._src_card()) + apc = max(1.0, self._tar_card()) + + num = n * (a - 0.5) ** 2 + if num: + return num / (apb * apc) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Fossum similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Fossum similarity + + Examples + -------- + >>> cmp = Fossum() + >>> cmp.sim('cat', 'hat') + 0.1836734693877551 + >>> cmp.sim('Niall', 'Neil') + 0.08925619834710742 + >>> cmp.sim('aluminum', 'Catalan') + 0.0038927335640138415 + >>> cmp.sim('ATCG', 'TAGC') + 0.01234567901234568 + + + .. versionadded:: 0.4.0 + + """ + num = self.sim_score(src, tar) + if num: + return num / max( + self.sim_score(src, src), self.sim_score(tar, tar) + ) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fuzzywuzzy_partial_string.py b/abydos/distance/_fuzzywuzzy_partial_string.py new file mode 100644 index 000000000..28009b403 --- /dev/null +++ b/abydos/distance/_fuzzywuzzy_partial_string.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fuzzywuzzy_partial_string. + +FuzzyWuzzy Partial String similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from difflib import SequenceMatcher + +from ._distance import _Distance + +__all__ = ['FuzzyWuzzyPartialString'] + + +class FuzzyWuzzyPartialString(_Distance): + """FuzzyWuzzy Partial String similarity. + + This follows the FuzzyWuzzy Partial String similarity algorithm + :cite:`Cohen:2011`. Rather than returning an integer in the range [0, 100], + as demonstrated in the blog post, this implementation returns a float in + the range [0.0, 1.0]. + + .. versionadded:: 0.4.0 + """ + + def sim(self, src, tar): + """Return the FuzzyWuzzy Partial String similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + FuzzyWuzzy Partial String similarity + + Examples + -------- + >>> cmp = FuzzyWuzzyPartialString() + >>> round(cmp.sim('cat', 'hat'), 12) + 0.666666666667 + >>> round(cmp.sim('Niall', 'Neil'), 12) + 0.75 + >>> round(cmp.sim('aluminum', 'Catalan'), 12) + 0.428571428571 + >>> cmp.sim('ATCG', 'TAGC') + 0.5 + + + .. 
versionadded:: 0.4.0 + + """ + max_sim = 0.0 + start_pos = 0 + + if len(src) > len(tar): + src, tar = tar, src + + src_len = len(src) + + while max_sim < 1.0 and start_pos < len(tar) - src_len + 1: + max_sim = max( + max_sim, + SequenceMatcher( + None, src, tar[start_pos : start_pos + src_len] + ).ratio(), + ) + start_pos += 1 + + return max_sim + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fuzzywuzzy_token_set.py b/abydos/distance/_fuzzywuzzy_token_set.py new file mode 100644 index 000000000..a8b28612f --- /dev/null +++ b/abydos/distance/_fuzzywuzzy_token_set.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fuzzywuzzy_token_set. + +FuzzyWuzzy Token Set similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from difflib import SequenceMatcher + + +from ._token_distance import _TokenDistance +from ..tokenizer import RegexpTokenizer + +__all__ = ['FuzzyWuzzyTokenSet'] + + +class FuzzyWuzzyTokenSet(_TokenDistance): + r"""FuzzyWuzzy Token Set similarity. + + This follows the FuzzyWuzzy Token Set similarity algorithm + :cite:`Cohen:2011`. Rather than returning an integer in the range [0, 100], + as demonstrated in the blog post, this implementation returns a float in + the range [0.0, 1.0]. Distinct from the + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize FuzzyWuzzyTokenSet instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package. + By default, the regexp tokenizer is employed, matching only + letters. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + if tokenizer is None: + tokenizer = RegexpTokenizer() + super(FuzzyWuzzyTokenSet, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the FuzzyWuzzy Token Set similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + FuzzyWuzzy Token Set similarity + + Examples + -------- + >>> cmp = FuzzyWuzzyTokenSet() + >>> cmp.sim('cat', 'hat') + 0.75 + >>> cmp.sim('Niall', 'Neil') + 0.7272727272727273 + >>> cmp.sim('aluminum', 'Catalan') + 0.47058823529411764 + >>> cmp.sim('ATCG', 'TAGC') + 0.6 + + + .. 
versionadded:: 0.4.0 + + """ + src = self.params['tokenizer'].tokenize(src).get_set() + tar = self.params['tokenizer'].tokenize(tar).get_set() + + intersection = src & tar + src -= intersection + tar -= intersection + + intersection = ' '.join(sorted(intersection)) + ' ' + src = intersection + ' '.join(sorted(src)) + tar = intersection + ' '.join(sorted(tar)) + + return max( + SequenceMatcher(None, src, intersection).ratio(), + SequenceMatcher(None, intersection, tar).ratio(), + SequenceMatcher(None, src, tar).ratio(), + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_fuzzywuzzy_token_sort.py b/abydos/distance/_fuzzywuzzy_token_sort.py new file mode 100644 index 000000000..c8566a385 --- /dev/null +++ b/abydos/distance/_fuzzywuzzy_token_sort.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._fuzzywuzzy_token_sort. + +FuzzyWuzzy Token Sort similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from difflib import SequenceMatcher + +from ._token_distance import _TokenDistance +from ..tokenizer import RegexpTokenizer + +__all__ = ['FuzzyWuzzyTokenSort'] + + +class FuzzyWuzzyTokenSort(_TokenDistance): + r"""FuzzyWuzzy Token Sort similarity. + + This follows the FuzzyWuzzy Token Sort similarity algorithm + :cite:`Cohen:2011`. Rather than returning an integer in the range [0, 100], + as demonstrated in the blog post, this implementation returns a float in + the range [0.0, 1.0]. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize FuzzyWuzzyTokenSort instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package. + By default, the regexp tokenizer is employed, matching only + letters. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + if tokenizer is None: + tokenizer = RegexpTokenizer() + super(FuzzyWuzzyTokenSort, self).__init__( + tokenizer=tokenizer, **kwargs + ) + + def sim(self, src, tar): + """Return the FuzzyWuzzy Token Sort similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + FuzzyWuzzy Token Sort similarity + + Examples + -------- + >>> cmp = FuzzyWuzzyTokenSort() + >>> cmp.sim('cat', 'hat') + 0.6666666666666666 + >>> cmp.sim('Niall', 'Neil') + 0.6666666666666666 + >>> cmp.sim('aluminum', 'Catalan') + 0.4 + >>> cmp.sim('ATCG', 'TAGC') + 0.5 + + + .. 
versionadded:: 0.4.0 + + """ + src = ' '.join( + sorted(self.params['tokenizer'].tokenize(src).get_list()) + ) + tar = ' '.join( + sorted(self.params['tokenizer'].tokenize(tar).get_list()) + ) + + return SequenceMatcher(None, src, tar).ratio() + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_generalized_fleiss.py b/abydos/distance/_generalized_fleiss.py new file mode 100644 index 000000000..493b939a5 --- /dev/null +++ b/abydos/distance/_generalized_fleiss.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._generalized_fleiss. + +Generalized Fleiss correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance +from ..stats._mean import ( + aghmean, + agmean, + amean, + cmean, + ghmean, + gmean, + heronian_mean, + hmean, + hoelder_mean, + imean, + lehmer_mean, + lmean, + qmean, + seiffert_mean, +) + +__all__ = ['GeneralizedFleiss'] + + +def _agmean_prec6(l): + return agmean(l, prec=6) + + +def _ghmean_prec6(l): + return ghmean(l, prec=6) + + +def _aghmean_prec6(l): + return aghmean(l, prec=6) + + +means = { + 'arithmetic': amean, + 'geometric': gmean, + 'harmonic': hmean, + 'ag': _agmean_prec6, + 'gh': _ghmean_prec6, + 'agh': _aghmean_prec6, + 'contraharmonic': cmean, + 'identric': imean, + 'logarithmic': lmean, + 'quadratic': qmean, + 'heronian': heronian_mean, + 'hoelder': hoelder_mean, + 'lehmer': lehmer_mean, + 'seiffert': seiffert_mean, +} + + +class GeneralizedFleiss(_TokenDistance): + r"""Generalized Fleiss correlation. + + For two sets X and Y and a population N, Generalized Fleiss correlation + is based on observations from :cite:`Fleiss:1975`. + + .. math:: + + corr_{GeneralizedFleiss}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {\mu_{products~of~marginals}} + + The mean function :math:`\mu` may be any of the mean functions in + :py:mod:`abydos.stats`. The products of marginals may be one of the + following: + + - ``a`` : :math:`|X| \cdot |N \setminus X|` & + :math:`|Y| \cdot |N \setminus Y|` + - ``b`` : :math:`|X| \cdot |Y|` & + :math:`|N \setminus X| \cdot |N \setminus Y|` + - ``c`` : :math:`|X| \cdot |N| \setminus Y|` & + :math:`|Y| \cdot |N \setminus X|` + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{GeneralizedFleiss} = + \frac{ad-bc}{\mu_{products~of~marginals}} + + And the products of marginals are: + + - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` & :math:`p_2q_2 = (a+c)(b+d)` + - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` & :math:`q_1q_2 = (c+d)(b+d)` + - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` & :math:`p_2q_1 = (a+c)(c+d)` + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + mean_func='arithmetic', + marginals='a', + proportional=False, + **kwargs + ): + """Initialize GeneralizedFleiss instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + mean_func : str or function + Specifies the mean function to use. A function taking a list of + numbers as its only required argument may be supplied, or one of + the following strings will select the specified mean function from + :py:mod:`abydos.stats`: + + - ``arithmetic`` employs :py:func:`amean`, and this measure + will be identical to :py:class:`MaxwellPilliner` with + otherwise default parameters + - ``geometric`` employs :py:func:`gmean`, and this measure + will be identical to :py:class:`PearsonPhi` with otherwise + default parameters + - ``harmonic`` employs :py:func:`hmean`, and this measure + will be identical to :py:class:`Fleiss` with otherwise + default parameters + - ``ag`` employs the arithmetic-geometric mean + :py:func:`agmean` + - ``gh`` employs the geometric-harmonic mean + :py:func:`ghmean` + - ``agh`` employs the arithmetic-geometric-harmonic mean + :py:func:`aghmean` + - ``contraharmonic`` employs the contraharmonic mean + :py:func:`cmean` + - ``identric`` employs the identric mean :py:func:`imean` + - ``logarithmic`` employs the logarithmic mean + :py:func:`lmean` + - ``quadratic`` employs the quadratic mean :py:func:`qmean` + - ``heronian`` employs the Heronian mean + :py:func:`heronian_mean` + - ``hoelder`` employs the Hölder mean :py:func:`hoelder_mean` + - ``lehmer`` employs the Lehmer mean :py:func:`lehmer_mean` + - ``seiffert`` employs Seiffert's mean + :py:func:`seiffert_mean` + marginals : str + Specifies the pairs of marginals to multiply and calculate the + resulting mean of. Can be: + + - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` & + :math:`p_2q_2 = (a+c)(b+d)` + - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` & + :math:`q_1q_2 = (c+d)(b+d)` + - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` & + :math:`p_2q_1 = (a+c)(c+d)` + proportional : bool + If true, each of the values, :math:`a, b, c, d` and the marginals + will be divided by the total :math:`a+b+c+d=n`. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + self.mean_func = mean_func + self.marginals = marginals + self.proportional = proportional + + super(GeneralizedFleiss, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Generalized Fleiss correlation of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Generalized Fleiss correlation + + Examples + -------- + >>> cmp = GeneralizedFleiss() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.35921989956790845 + >>> cmp.corr('aluminum', 'Catalan') + 0.10803030303030303 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + if self.proportional: + a /= n + b /= n + c /= n + d /= n + + num = a * d - b * c + if not num: + return 0.0 + + if self.marginals == 'b': + mps = [(a + b) * (a + c), (c + d) * (b + d)] + elif self.marginals == 'c': + mps = [(a + b) * (b + d), (a + c) * (c + d)] + else: + mps = [(a + b) * (c + d), (a + c) * (b + d)] + + mean_value = ( + self.mean_func(mps) + if callable(self.mean_func) + else means[self.mean_func](mps) + ) + + return num / mean_value + + def sim(self, src, tar): + """Return the Generalized Fleiss similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Generalized Fleiss similarity + + Examples + -------- + >>> cmp = GeneralizedFleiss() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6796099497839543 + >>> cmp.sim('aluminum', 'Catalan') + 0.5540151515151515 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gilbert.py b/abydos/distance/_gilbert.py new file mode 100644 index 000000000..68d9cbba6 --- /dev/null +++ b/abydos/distance/_gilbert.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gilbert. + +Gilbert correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Gilbert'] + + +class Gilbert(_TokenDistance): + r"""Gilbert correlation. + + For two sets X and Y and a population N, the Gilbert correlation + :cite:`Gilbert:1884` is + + .. 
math:: + + corr_{Gilbert}(X, Y) = + \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 - + |(N \setminus X) \setminus Y|^2} + + For lack of access to the original, this formula is based on the concurring + formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Gilbert} = + \frac{2(ad-cd)}{n^2-a^2+b^2+c^2-d^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Gilbert instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Gilbert, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Gilbert correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gilbert correlation + + Examples + -------- + >>> cmp = Gilbert() + >>> cmp.corr('cat', 'hat') + 0.3310580204778157 + >>> cmp.corr('Niall', 'Neil') + 0.21890122402504983 + >>> cmp.corr('aluminum', 'Catalan') + 0.057094811018577836 + >>> cmp.corr('ATCG', 'TAGC') + -0.003198976327575176 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + num = a * n - (a + b) * (a + c) + if num: + return num / (n * (a + b + c) - (a + b) * (a + c)) + return 0.0 + + def sim(self, src, tar): + """Return the Gilbert similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gilbert similarity + + Examples + -------- + >>> cmp = Gilbert() + >>> cmp.sim('cat', 'hat') + 0.6655290102389079 + >>> cmp.sim('Niall', 'Neil') + 0.6094506120125249 + >>> cmp.sim('aluminum', 'Catalan') + 0.5285474055092889 + >>> cmp.sim('ATCG', 'TAGC') + 0.4984005118362124 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gilbert_wells.py b/abydos/distance/_gilbert_wells.py new file mode 100644 index 000000000..01638a349 --- /dev/null +++ b/abydos/distance/_gilbert_wells.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gilbert_wells. + +Gilbert & Wells similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import factorial, log, pi +from sys import float_info + +from ._token_distance import _TokenDistance + +__all__ = ['GilbertWells'] + +_epsilon = float_info.epsilon + + +class GilbertWells(_TokenDistance): + r"""Gilbert & Wells similarity. + + For two sets X and Y and a population N, the Gilbert & Wells + similarity :cite:`Gilbert:1966` is + + .. math:: + + sim_{GilbertWells}(X, Y) = + ln \frac{|N|^3}{2\pi |X| \cdot |Y| \cdot + |N \setminus Y| \cdot |N \setminus X|} + 2ln + \frac{|N|! \cdot |X \cap Y|! \cdot |X \setminus Y|! \cdot + |Y \setminus X|! \cdot |(N \setminus X) \setminus Y|!} + {|X|! \cdot |Y|! \cdot |N \setminus Y|! \cdot |N \setminus X|!} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{GilbertWells} = + ln \frac{n^3}{2\pi (a+b)(a+c)(b+d)(c+d)} + + 2ln \frac{n!a!b!c!d!}{(a+b)!(a+c)!(b+d)!(c+d)!} + + Notes + ----- + Most lists of similarity & distance measures, including + :cite:`Hubalek:1982,Choi:2010,Morris:2012` have a quite different formula, + which would be :math:`ln~a - ln~b - ln \frac{a+b}{n} - ln \frac{a+c}{n} = + ln\frac{an}{(a+b)(a+c)}`. However, neither this formula nor anything + similar or equivalent to it appears anywhere within the cited work, + :cite:`Gilbert:1966`. See :class:``UnknownF`` for this, alternative, + measure. + + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, alphabet=None, tokenizer=None, **kwargs): + """Initialize GilbertWells instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(GilbertWells, self).__init__( + alphabet=alphabet, tokenizer=tokenizer, **kwargs + ) + + def sim_score(self, src, tar): + """Return the Gilbert & Wells similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gilbert & Wells similarity + + Examples + -------- + >>> cmp = GilbertWells() + >>> cmp.sim_score('cat', 'hat') + 20.17617447734673 + >>> cmp.sim_score('Niall', 'Neil') + 16.717742356982733 + >>> cmp.sim_score('aluminum', 'Catalan') + 5.495096667524002 + >>> cmp.sim_score('ATCG', 'TAGC') + 1.6845961909440712 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return log( + max( + _epsilon, + n ** 3 + / ( + 2 + * pi + * max(_epsilon, a + b) + * max(_epsilon, a + c) + * max(_epsilon, b + d) + * max(_epsilon, c + d) + ), + ) + ) + 2 * ( + log(factorial(n)) + + log(factorial(a)) + + log(factorial(b)) + + log(factorial(c)) + + log(factorial(d)) + - log(factorial(a + b)) + - log(factorial(a + c)) + - log(factorial(b + d)) + - log(factorial(c + d)) + ) + + def sim(self, src, tar): + """Return the normalized Gilbert & Wells similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Gilbert & Wells similarity + + Examples + -------- + >>> cmp = GilbertWells() + >>> cmp.sim('cat', 'hat') + 0.4116913723876516 + >>> cmp.sim('Niall', 'Neil') + 0.2457247406857589 + >>> cmp.sim('aluminum', 'Catalan') + 0.05800001636414742 + >>> cmp.sim('ATCG', 'TAGC') + 0.028716013247135602 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + norm = max(self.sim_score(src, src), self.sim_score(tar, tar)) + return self.sim_score(src, tar) / norm + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gini_i.py b/abydos/distance/_gini_i.py new file mode 100644 index 000000000..5903dbf5e --- /dev/null +++ b/abydos/distance/_gini_i.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gini_i. + +Gini I correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from sys import float_info + +from ._token_distance import _TokenDistance + +__all__ = ['GiniI'] + +_epsilon = float_info.epsilon + + +class GiniI(_TokenDistance): + r"""Gini I correlation. + + For two sets X and Y and a population N, Gini I correlation + :cite:`Gini:1912`, using the formula from :cite:`Goodman:1959`, is + + .. 
math:: + + corr_{GiniI}(X, Y) = + \frac{\frac{|X \cap Y|+|(N \setminus X) \setminus Y|}{|N|} - + \frac{|X| \cdot |Y|}{|N|} + + \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|}} + {\sqrt{(1-(\frac{|X|}{|N|}^2+\frac{|Y|}{|N|}^2)) \cdot + (1-(\frac{|N \setminus Y|}{|N|}^2 + + \frac{|N \setminus X|}{|N|}^2))}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + after each term has been converted to a proportion by dividing by n, this + is + + .. math:: + + corr_{GiniI} = + \frac{(a+d)-(a+b)(a+c) + (b+d)(c+d)} + {\sqrt{(1-((a+b)^2+(c+d)^2))\cdot(1-((a+c)^2+(b+d)^2))}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize GiniI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GiniI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def corr(self, src, tar): + """Return the Gini I correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gini I correlation + + Examples + -------- + >>> cmp = GiniI() + >>> cmp.corr('cat', 'hat') + 0.49722814498933254 + >>> cmp.corr('Niall', 'Neil') + 0.39649090262533215 + >>> cmp.corr('aluminum', 'Catalan') + 0.14887105223941113 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237489576 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + return ((a + d) - ((a + b) * (a + c) + (c + d) * (b + d))) / ( + (1 + _epsilon - ((a + b) ** 2 + (c + d) ** 2)) + * (1 + _epsilon - ((a + c) ** 2 + (b + d) ** 2)) + ) ** 0.5 + + def sim(self, src, tar): + """Return the normalized Gini I similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Gini I similarity + + Examples + -------- + >>> cmp = GiniI() + >>> cmp.sim('cat', 'hat') + 0.7486140724946663 + >>> cmp.sim('Niall', 'Neil') + 0.6982454513126661 + >>> cmp.sim('aluminum', 'Catalan') + 0.5744355261197056 + >>> cmp.sim('ATCG', 'TAGC') + 0.4967907573812552 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gini_ii.py b/abydos/distance/_gini_ii.py new file mode 100644 index 000000000..9c48e848a --- /dev/null +++ b/abydos/distance/_gini_ii.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gini_ii. + +Gini II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from sys import float_info + +from ._token_distance import _TokenDistance + +__all__ = ['GiniII'] + +_epsilon = float_info.epsilon + + +class GiniII(_TokenDistance): + r"""Gini II distance. + + For two sets X and Y and a population N, Gini II correlation + :cite:`Gini:1915`, using the formula from :cite:`Goodman:1959`, is + + .. math:: + + corr_{GiniII}(X, Y) = + \frac{\frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} - + (\frac{|X| \cdot |Y|}{|N|} + + \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|})} + {1 - |\frac{|Y \setminus X| - |X \setminus Y|}{|N|}| + - (\frac{|X| \cdot |Y|}{|N|} + + \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|})} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + after each term has been converted to a proportion by dividing by n, this + is + + .. math:: + + corr_{GiniII} = + \frac{(a+d) - ((a+b)(a+c) + (b+d)(c+d))} + {1 - |b-c| - ((a+b)(a+c) + (b+d)(c+d))} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize GiniII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GiniII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def corr(self, src, tar): + """Return the Gini II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gini II correlation + + Examples + -------- + >>> cmp = GiniII() + >>> cmp.corr('cat', 'hat') + 0.49722814498933254 + >>> cmp.corr('Niall', 'Neil') + 0.4240703425535771 + >>> cmp.corr('aluminum', 'Catalan') + 0.15701415701415936 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237489576 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + return ((a + d) - ((a + b) * (a + c) + (c + d) * (b + d))) / ( + ( + 1 + + _epsilon + - abs(b - c) + - ((a + b) * (a + c) + (c + d) * (b + d)) + ) + ) + + def sim(self, src, tar): + """Return the normalized Gini II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Gini II similarity + + Examples + -------- + >>> cmp = GiniII() + >>> cmp.sim('cat', 'hat') + 0.7486140724946663 + >>> cmp.sim('Niall', 'Neil') + 0.7120351712767885 + >>> cmp.sim('aluminum', 'Catalan') + 0.5785070785070797 + >>> cmp.sim('ATCG', 'TAGC') + 0.4967907573812552 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_goodall.py b/abydos/distance/_goodall.py new file mode 100644 index 000000000..cea88f467 --- /dev/null +++ b/abydos/distance/_goodall.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._goodall. + +Goodall similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import asin, pi + +from ._token_distance import _TokenDistance + +__all__ = ['Goodall'] + + +class Goodall(_TokenDistance): + r"""Goodall similarity. 
+ + For two sets X and Y and a population N, Goodall similarity + :cite:`Goodall:1967,Austin:1977` is an angular transformation of Sokal + & Michener's simple matching coefficient + + .. math:: + + sim_{Goodall}(X, Y) = \frac{2}{\pi} \sin^{-1}\Big( + \sqrt{\frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}} + \Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Goodall} =\frac{2}{\pi} \sin^{-1}\Big( + \sqrt{\frac{a + d}{n}} + \Big) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Goodall instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Goodall, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Goodall similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodall similarity + + Examples + -------- + >>> cmp = Goodall() + >>> cmp.sim('cat', 'hat') + 0.9544884026871964 + >>> cmp.sim('Niall', 'Neil') + 0.9397552079794624 + >>> cmp.sim('aluminum', 'Catalan') + 0.9117156301536503 + >>> cmp.sim('ATCG', 'TAGC') + 0.9279473952929225 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return 2 / pi * asin(((a + d) / n) ** 0.5) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_goodman_kruskal_lambda.py b/abydos/distance/_goodman_kruskal_lambda.py new file mode 100644 index 000000000..579251d76 --- /dev/null +++ b/abydos/distance/_goodman_kruskal_lambda.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
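# A minimal standalone sketch of the Goodall formula documented above,
# sim = (2 / pi) * asin(sqrt((a + d) / n)), written independently of the
# library.  The counts a=2, b=2, c=2, d=778 are hypothetical, chosen to be
# consistent with the 'cat'/'hat' doctest under the apparent defaults
# (bigram tokens plus a 784-symbol alphabet).
from math import asin, pi


def goodall_from_counts(a, b, c, d):
    """Compute Goodall similarity from 2x2 confusion-table counts."""
    n = a + b + c + d
    return 2 / pi * asin(((a + d) / n) ** 0.5)


print(goodall_from_counts(2, 2, 2, 778))  # ~0.95449, cf. Goodall().sim('cat', 'hat')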
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._goodman_kruskal_lambda. + +Goodman & Kruskal's Lambda similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GoodmanKruskalLambda'] + + +class GoodmanKruskalLambda(_TokenDistance): + r"""Goodman & Kruskal's Lambda similarity. + + For two sets X and Y and a population N, Goodman & Kruskal's lambda + :cite:`Goodman:1954` is + + .. math:: + + sim_{GK_\lambda}(X, Y) = + \frac{\frac{1}{2}(max(|X \cap Y|, |X \setminus Y|)+ + max(|Y \setminus X|, |(N \setminus X) \setminus Y|)+ + max(|X \cap Y|, |Y \setminus X|)+ + max(|X \setminus Y|, |(N \setminus X) \setminus Y|))- + (max(|X|, |N \setminus X|)+max(|Y|, |N \setminus Y|))} + {|N|-\frac{1}{2}(max(|X|, |N \setminus X|)+ + max(|Y|, |N \setminus Y|))} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{GK_\lambda} = + \frac{\frac{1}{2}((max(a,b)+max(c,d)+max(a,c)+max(b,d))- + (max(a+b,c+d)+max(a+c,b+d)))} + {n-\frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize GoodmanKruskalLambda instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GoodmanKruskalLambda, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return Goodman & Kruskal's Lambda similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodman & Kruskal's Lambda similarity + + Examples + -------- + >>> cmp = GoodmanKruskalLambda() + >>> cmp.sim('cat', 'hat') + 0.0 + >>> cmp.sim('Niall', 'Neil') + 0.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + sigma = max(a, b) + max(c, d) + max(a, c) + max(b, d) + sigma_prime = max(a + c, b + d) + max(a + b, c + d) + num = sigma - sigma_prime + + if num: + return num / (2 * (a + b + c + d) - sigma_prime) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_goodman_kruskal_lambda_r.py b/abydos/distance/_goodman_kruskal_lambda_r.py new file mode 100644 index 000000000..80ade7839 --- /dev/null +++ b/abydos/distance/_goodman_kruskal_lambda_r.py @@ -0,0 +1,200 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._goodman_kruskal_lambda_r. + +Goodman & Kruskal Lambda-r correlation. +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GoodmanKruskalLambdaR'] + + +class GoodmanKruskalLambdaR(_TokenDistance): + r"""Goodman & Kruskal Lambda-r correlation. + + For two sets X and Y and a population N, Goodman & Kruskal + :math:`\lambda_r` correlation :cite:`Goodman:1954` is + + .. math:: + + corr_{GK_{\lambda_r}}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y| - + \frac{1}{2}(max(|X|, |N \setminus X|) + max(|Y|, |N \setminus Y|))} + {|N| - + \frac{1}{2}(max(|X|, |N \setminus X|) + max(|Y|, |N \setminus Y|))} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{GK_{\lambda_r}} = + \frac{a + d - \frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))} + {n - \frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize GoodmanKruskalLambdaR instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(GoodmanKruskalLambdaR, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return Goodman & Kruskal Lambda-r correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodman & Kruskal Lambda-r correlation + + Examples + -------- + >>> cmp = GoodmanKruskalLambdaR() + >>> cmp.corr('cat', 'hat') + 0.0 + >>> cmp.corr('Niall', 'Neil') + -0.2727272727272727 + >>> cmp.corr('aluminum', 'Catalan') + -0.7647058823529411 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return -1.0 + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + sigma_prime = max(a + b, c + d) + max(a + c, b + d) + + num = 2 * (a + d) - sigma_prime + + if num: + return num / (2 * n - sigma_prime) + return 0.0 + + def sim(self, src, tar): + """Return Goodman & Kruskal Lambda-r similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodman & Kruskal Lambda-r similarity + + Examples + -------- + >>> cmp = GoodmanKruskalLambdaR() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.11764705882352944 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_goodman_kruskal_tau_a.py b/abydos/distance/_goodman_kruskal_tau_a.py new file mode 100644 index 000000000..2ca5a3eda --- /dev/null +++ b/abydos/distance/_goodman_kruskal_tau_a.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._goodman_kruskal_tau_a. + +Goodman & Kruskal's Tau A similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GoodmanKruskalTauA'] + + +class GoodmanKruskalTauA(_TokenDistance): + r"""Goodman & Kruskal's Tau A similarity. + + For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_a` + similarity :cite:`Goodman:1954`, by analogy with :math:`\tau_b`, is + + .. 
math:: + + sim_{GK_{\tau_a}}(X, Y) = + \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + + \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+ + \frac{\frac{|X \setminus Y|}{|N|}^2 + + \frac{|(N \setminus X) \setminus Y|}{|N|}^2} + {\frac{|N \setminus X|}{|N|}} - + (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} + {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + after each term has been converted to a proportion by dividing by n, this + is + + .. math:: + + sim_{GK_{\tau_a}} = + \frac{ + \frac{a^2 + c^2}{a+c} + + \frac{b^2 + d^2}{b+d} - + ((a+b)^2 + (c+d)^2)} + {1 - ((a+b)^2 + (c+d)^2)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize GoodmanKruskalTauA instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GoodmanKruskalTauA, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def sim(self, src, tar): + """Return Goodman & Kruskal's Tau A similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodman & Kruskal's Tau A similarity + + Examples + -------- + >>> cmp = GoodmanKruskalTauA() + >>> cmp.sim('cat', 'hat') + 0.3304969657208484 + >>> cmp.sim('Niall', 'Neil') + 0.22137604585914503 + >>> cmp.sim('aluminum', 'Catalan') + 0.05991264724130685 + >>> cmp.sim('ATCG', 'TAGC') + 4.119695274745721e-05 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if a + b == 0 or a + c == 0: + return 0.0 + + fp = (a * a + c * c) / (a + c) + + sp = b * b + d * d + if sp: + sp /= b + d + + num = fp + sp - (a + b) ** 2 - (c + d) ** 2 + if num > 1e-14: + return num / (1 - (a + b) ** 2 - (c + d) ** 2) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_goodman_kruskal_tau_b.py b/abydos/distance/_goodman_kruskal_tau_b.py new file mode 100644 index 000000000..282b24df3 --- /dev/null +++ b/abydos/distance/_goodman_kruskal_tau_b.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._goodman_kruskal_tau_b. + +Goodman & Kruskal's Tau B similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GoodmanKruskalTauB'] + + +class GoodmanKruskalTauB(_TokenDistance): + r"""Goodman & Kruskal's Tau B similarity. + + For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_b` + similarity :cite:`Goodman:1954` is + + .. math:: + + sim_{GK_{\tau_b}}(X, Y) = + \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + + \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+ + \frac{\frac{|Y \setminus X|}{|N|}^2 + + \frac{|(N \setminus X) \setminus Y|}{|N|}^2} + {\frac{|N \setminus X|}{|N|}} - + (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} + {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + after each term has been converted to a proportion by dividing by n, this + is + + .. math:: + + sim_{GK_{\tau_b}} = + \frac{ + \frac{a^2 + b^2}{a+b} + + \frac{c^2 + d^2}{c+d} - + ((a+c)^2 + (b+d)^2)} + {1 - ((a+c)^2 + (b+d)^2)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize GoodmanKruskalTauB instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GoodmanKruskalTauB, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def sim(self, src, tar): + """Return Goodman & Kruskal's Tau B similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Goodman & Kruskal's Tau B similarity + + Examples + -------- + >>> cmp = GoodmanKruskalTauB() + >>> cmp.sim('cat', 'hat') + 0.3304969657208484 + >>> cmp.sim('Niall', 'Neil') + 0.2346006486710202 + >>> cmp.sim('aluminum', 'Catalan') + 0.06533810992392582 + >>> cmp.sim('ATCG', 'TAGC') + 4.119695274745721e-05 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if a + b == 0 or a + c == 0: + return 0.0 + + fp = (a * a + b * b) / (a + b) + + sp = c * c + d * d + if sp: + sp /= c + d + + num = fp + sp - (a + c) ** 2 - (b + d) ** 2 + if num > 1e-14: + return num / (1 - (a + c) ** 2 - (b + d) ** 2) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gotoh.py b/abydos/distance/_gotoh.py index abf565f05..1888ded52 100644 --- a/abydos/distance/_gotoh.py +++ b/abydos/distance/_gotoh.py @@ -28,6 +28,8 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import float32 as np_float32 from numpy import zeros as np_zeros @@ -35,6 +37,7 @@ from ._ident import sim_ident from ._needleman_wunsch import NeedlemanWunsch +from .. import __version__ __all__ = ['Gotoh', 'gotoh'] @@ -44,17 +47,15 @@ class Gotoh(NeedlemanWunsch): The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with affine gap penalties. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): - """Return the Gotoh score of two strings. + def __init__(self, gap_open=1, gap_ext=0.4, sim_func=None, **kwargs): + """Initialize Gotoh instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison gap_open : float The cost of an open alignment gap (1 by default) gap_ext : float @@ -62,6 +63,29 @@ def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): sim_func : function A function that returns the similarity of two characters (identity similarity by default) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Gotoh, self).__init__(**kwargs) + self._gap_open = gap_open + self._gap_ext = gap_ext + self._sim_func = sim_func + if self._sim_func is None: + self._sim_func = NeedlemanWunsch.sim_matrix + + def dist_abs(self, src, tar): + """Return the Gotoh score of two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -80,6 +104,11 @@ def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): >>> cmp.dist_abs('cat', 'hat') 2.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) @@ -90,18 +119,18 @@ def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): q_mat[0, 0] = float('-inf') for i in range(1, len(src) + 1): d_mat[i, 0] = float('-inf') - p_mat[i, 0] = -gap_open - gap_ext * (i - 1) + p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1) q_mat[i, 0] = float('-inf') - q_mat[i, 1] = -gap_open + q_mat[i, 1] = -self._gap_open for j in range(1, len(tar) + 1): d_mat[0, j] = float('-inf') p_mat[0, j] = float('-inf') - p_mat[1, j] = -gap_open - q_mat[0, j] = -gap_open - gap_ext * (j - 1) + p_mat[1, j] = -self._gap_open + q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1) for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): - sim_val = sim_func(src[i - 1], tar[j - 1]) + sim_val = self._sim_func(src[i - 1], tar[j - 1]) d_mat[i, j] = max( d_mat[i - 1, j - 1] + sim_val, p_mat[i - 1, j - 1] + sim_val, @@ -109,17 +138,25 @@ def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): ) p_mat[i, j] = max( - d_mat[i - 1, j] - gap_open, p_mat[i - 1, j] - gap_ext + d_mat[i - 1, j] - self._gap_open, + p_mat[i - 1, j] - self._gap_ext, ) q_mat[i, j] = max( - d_mat[i, j - 1] - gap_open, q_mat[i, j - 1] - gap_ext + d_mat[i, j - 1] - self._gap_open, + q_mat[i, j - 1] - self._gap_ext, ) i, j = (n - 1 for n in d_mat.shape) return max(d_mat[i, j], p_mat[i, j], q_mat[i, j]) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Gotoh.dist_abs method instead.', +) def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): """Return the Gotoh score of two strings. @@ -155,8 +192,10 @@ def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): >>> gotoh('cat', 'hat') 2.0 + .. versionadded:: 0.1.0 + """ - return Gotoh().dist_abs(src, tar, gap_open, gap_ext, sim_func) + return Gotoh(gap_open, gap_ext, sim_func).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_gower_legendre.py b/abydos/distance/_gower_legendre.py new file mode 100644 index 000000000..7a645461c --- /dev/null +++ b/abydos/distance/_gower_legendre.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gower_legendre. 
+ +Gower & Legendre similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GowerLegendre'] + + +class GowerLegendre(_TokenDistance): + r"""Gower & Legendre similarity. + + For two sets X and Y and a population N, the Gower & Legendre similarity + :cite:`Gower:1986` is + + .. math:: + + sim_{GowerLegendre}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|} + {|X \cap Y| + |(N \setminus X) \setminus Y| + + \theta \cdot |X \triangle Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{GowerLegendre} = + \frac{a+d}{a+\theta(b+c)+d} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + theta=0.5, + **kwargs + ): + """Initialize GowerLegendre instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + theta : float + The weight to place on the symmetric difference. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + self.theta = theta + super(GowerLegendre, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Gower & Legendre similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gower & Legendre similarity + + Examples + -------- + >>> cmp = GowerLegendre() + >>> cmp.sim('cat', 'hat') + 0.9974424552429667 + >>> cmp.sim('Niall', 'Neil') + 0.9955156950672646 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903536977491961 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + apd = self._intersection_card() + self._total_complement_card() + bpc = self._src_only_card() + self._tar_only_card() + + return apd / (apd + self.theta * bpc) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_guttman_lambda_a.py b/abydos/distance/_guttman_lambda_a.py new file mode 100644 index 000000000..c1ac8adbf --- /dev/null +++ b/abydos/distance/_guttman_lambda_a.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._guttman_lambda_a. + +Guttman's Lambda A similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GuttmanLambdaA'] + + +class GuttmanLambdaA(_TokenDistance): + r"""Guttman's Lambda A similarity. + + For two sets X and Y and a population N, Guttman's :math:`\lambda_a` + similarity :cite:`Guttman:1941` is + + .. math:: + + sim_{Guttman_{\lambda_a}}(X, Y) = + \frac{max(|X \cap Y|, |Y \setminus X|) + max(|X \setminus Y|, + |(N \setminus X) \setminus Y|) - max(|X|, |N \setminus X|)} + {|N| - max(|X|, |N \setminus X|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Guttman_{\lambda_a}} = + \frac{max(a, c) + max(b, d) - max(a+b, c+d)}{n - max(a+b, c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize GuttmanLambdaA instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GuttmanLambdaA, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Guttman Lambda A similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Guttman's Lambda A similarity + + Examples + -------- + >>> cmp = GuttmanLambdaA() + >>> cmp.sim('cat', 'hat') + 0.0 + >>> cmp.sim('Niall', 'Neil') + 0.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = round(float(max(a, c) + max(b, d) - max(a + b, c + d)), 15) + if num > 1e-8: + return num / float(n - max(a + b, c + d)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_guttman_lambda_b.py b/abydos/distance/_guttman_lambda_b.py new file mode 100644 index 000000000..d34253b03 --- /dev/null +++ b/abydos/distance/_guttman_lambda_b.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._guttman_lambda_b. + +Guttman's Lambda B similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GuttmanLambdaB'] + + +class GuttmanLambdaB(_TokenDistance): + r"""Guttman's Lambda B similarity. + + For two sets X and Y and a population N, Guttman's :math:`\lambda_b` + similarity :cite:`Guttman:1941` is + + .. math:: + + sim_{Guttman_{\lambda_b}}(X, Y) = + \frac{max(|X \cap Y|, |X \setminus Y|) + max(|Y \setminus X|, + |(N \setminus X) \setminus Y|) - max(|Y|, |N \setminus Y|)} + {|N| - max(|Y|, |N \setminus Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Guttman_{\lambda_b}} = + \frac{max(a, b) + max(c, d) - max(a+c, b+d)}{n - max(a+c, b+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize GuttmanLambdaB instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(GuttmanLambdaB, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Guttman Lambda B similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Guttman's Lambda B similarity + + Examples + -------- + >>> cmp = GuttmanLambdaB() + >>> cmp.sim('cat', 'hat') + 0.0 + >>> cmp.sim('Niall', 'Neil') + 0.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = round(float(max(a, b) + max(c, d) - max(a + c, b + d)), 15) + if num > 1e-8: + return num / float(n - max(a + c, b + d)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_gwet_ac.py b/abydos/distance/_gwet_ac.py new file mode 100644 index 000000000..c54775076 --- /dev/null +++ b/abydos/distance/_gwet_ac.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._gwet_ac. + +Gwet's AC correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['GwetAC'] + + +class GwetAC(_TokenDistance): + r"""Gwet's AC correlation. + + For two sets X and Y and a population N, Gwet's AC correlation + :cite:`Gwet:2008` is + + .. math:: + + corr_{Gwet_{AC}}(X, Y) = AC = + \frac{p_o - p_e^{AC}}{1 - p_e^{AC}} + + where + + .. math:: + + \begin{array}{lll} + p_o &=&\frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} + + p_e^{AC}&=&\frac{1}{2}\Big(\frac{|X|+|Y|}{|N|}\cdot + \frac{|X \setminus Y| + |Y \setminus X|}{|N|}\Big) + \end{array} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + \begin{array}{lll} + p_o&=&\frac{a+d}{n} + + p_e^{AC}&=&\frac{1}{2}\Big(\frac{2a+b+c}{n}\cdot + \frac{2d+b+c}{n}\Big) + \end{array} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize GwetAC instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(GwetAC, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Gwet's AC correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gwet's AC correlation + + Examples + -------- + >>> cmp = GwetAC() + >>> cmp.corr('cat', 'hat') + 0.9948456319360438 + >>> cmp.corr('Niall', 'Neil') + 0.990945276504824 + >>> cmp.corr('aluminum', 'Catalan') + 0.9804734301840141 + >>> cmp.corr('ATCG', 'TAGC') + 0.9870811678360627 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + po = (a + d) / n + q = (2 * a + b + c) / (2 * n) + pe = 2 * q * (1 - q) + + return (po - pe) / (1 - pe) + + def sim(self, src, tar): + """Return the Gwet's AC similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Gwet's AC similarity + + Examples + -------- + >>> cmp = GwetAC() + >>> cmp.sim('cat', 'hat') + 0.9974228159680218 + >>> cmp.sim('Niall', 'Neil') + 0.995472638252412 + >>> cmp.sim('aluminum', 'Catalan') + 0.9902367150920071 + >>> cmp.sim('ATCG', 'TAGC') + 0.9935405839180314 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hamann.py b/abydos/distance/_hamann.py new file mode 100644 index 000000000..1f67a2f39 --- /dev/null +++ b/abydos/distance/_hamann.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._hamann. 
+ +Hamann correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Hamann'] + + +class Hamann(_TokenDistance): + r"""Hamann correlation. + + For two sets X and Y and a population N, the Hamann correlation + :cite:`Hamann:1961` is + + .. math:: + + corr_{Hamann}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y| - + |X \setminus Y| - |Y \setminus X|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Hamann} = + \frac{a+d-b-c}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Hamann instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Hamann, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Hamann correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hamann correlation + + Examples + -------- + >>> cmp = Hamann() + >>> cmp.corr('cat', 'hat') + 0.9897959183673469 + >>> cmp.corr('Niall', 'Neil') + 0.9821428571428571 + >>> cmp.corr('aluminum', 'Catalan') + 0.9617834394904459 + >>> cmp.corr('ATCG', 'TAGC') + 0.9744897959183674 + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + return ( + self._intersection_card() + + self._total_complement_card() + - self._src_only_card() + - self._tar_only_card() + ) / self._population_unique_card() + + def sim(self, src, tar): + """Return the normalized Hamann similarity of two strings. + + Hamann similarity, which has a range [-1, 1] is normalized to [0, 1] by + adding 1 and dividing by 2. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Hamann similarity + + Examples + -------- + >>> cmp = Hamann() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.9910714285714286 + >>> cmp.sim('aluminum', 'Catalan') + 0.9808917197452229 + >>> cmp.sim('ATCG', 'TAGC') + 0.9872448979591837 + + .. 
versionadded:: 0.4.0 + + """ + return (self.corr(src, tar) + 1) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hamming.py b/abydos/distance/_hamming.py index 70a777fc2..764806f9f 100644 --- a/abydos/distance/_hamming.py +++ b/abydos/distance/_hamming.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ __all__ = ['Hamming', 'dist_hamming', 'hamming', 'sim_hamming'] @@ -41,17 +44,15 @@ class Hamming(_Distance): it is not normally defined. By default, this implementation calculates the Hamming distance of the first n characters where n is the lesser of the two strings' lengths and adds to this the difference in string lengths. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, diff_lens=True): - """Return the Hamming distance between two strings. + def __init__(self, diff_lens=True, **kwargs): + """Initialize Hamming instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison diff_lens : bool If True (default), this returns the Hamming distance for those characters that have a matching character in both strings plus the @@ -59,6 +60,25 @@ def dist_abs(self, src, tar, diff_lens=True): the shorter string with obligatorily non-matching characters. If False, an exception is raised in the case of strings of unequal lengths. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Hamming, self).__init__(**kwargs) + self._diff_lens = diff_lens + + def dist_abs(self, src, tar): + """Return the Hamming distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -83,8 +103,13 @@ def dist_abs(self, src, tar, diff_lens=True): >>> cmp.dist_abs('ATCG', 'TAGC') 4 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - if not diff_lens and len(src) != len(tar): + if not self._diff_lens and len(src) != len(tar): raise ValueError( 'Undefined for sequences of unequal length; set diff_lens ' + 'to True for Hamming distance between strings of unequal ' @@ -92,13 +117,13 @@ def dist_abs(self, src, tar, diff_lens=True): ) hdist = 0 - if diff_lens: + if self._diff_lens: hdist += abs(len(src) - len(tar)) hdist += sum(c1 != c2 for c1, c2 in zip(src, tar)) return hdist - def dist(self, src, tar, diff_lens=True): + def dist(self, src, tar): """Return the normalized Hamming distance between two strings. Hamming distance normalized to the interval [0, 1]. @@ -115,13 +140,6 @@ def dist(self, src, tar, diff_lens=True): Source string for comparison tar : str Target string for comparison - diff_lens : bool - If True (default), this returns the Hamming distance for those - characters that have a matching character in both strings plus the - difference in the strings' lengths. This is equivalent to extending - the shorter string with obligatorily non-matching characters. If - False, an exception is raised in the case of strings of unequal - lengths. Returns ------- @@ -140,12 +158,23 @@ def dist(self, src, tar, diff_lens=True): >>> cmp.dist('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 - return self.dist_abs(src, tar, diff_lens) / max(len(src), len(tar)) + return self.dist_abs(src, tar) / max(len(src), len(tar)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Hamming.dist_abs method instead.', +) def hamming(src, tar, diff_lens=True): """Return the Hamming distance between two strings. @@ -180,10 +209,18 @@ def hamming(src, tar, diff_lens=True): >>> hamming('ATCG', 'TAGC') 4 + .. versionadded:: 0.1.0 + """ - return Hamming().dist_abs(src, tar, diff_lens) + return Hamming(diff_lens).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Hamming.dist method instead.', +) def dist_hamming(src, tar, diff_lens=True): """Return the normalized Hamming distance between two strings. @@ -218,10 +255,18 @@ def dist_hamming(src, tar, diff_lens=True): >>> dist_hamming('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return Hamming().dist(src, tar, diff_lens) + return Hamming(diff_lens).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Hamming.sim method instead.', +) def sim_hamming(src, tar, diff_lens=True): """Return the normalized Hamming similarity of two strings. @@ -256,8 +301,10 @@ def sim_hamming(src, tar, diff_lens=True): >>> sim_hamming('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ - return Hamming().sim(src, tar, diff_lens) + return Hamming(diff_lens).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_harris_lahey.py b/abydos/distance/_harris_lahey.py new file mode 100644 index 000000000..ca3749f20 --- /dev/null +++ b/abydos/distance/_harris_lahey.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._harris_lahey. + +Harris & Lahey similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['HarrisLahey'] + + +class HarrisLahey(_TokenDistance): + r"""Harris & Lahey similarity. + + For two sets X and Y and a population N, Harris & Lahey similarity + :cite:`Harris:1978` is + + .. math:: + + sim_{HarrisLahey}(X, Y) = + \frac{|X \cap Y|}{|X \cup Y|}\cdot + \frac{|N \setminus Y| + |N \setminus X|}{2|N|}+ + \frac{|(N \setminus X) \setminus Y|}{|N \setminus (X \cap Y)|}\cdot + \frac{|X| + |Y|}{2|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. 
math:: + + sim_{HarrisLahey} = + \frac{a}{a+b+c}\cdot\frac{2d+b+c}{2n}+ + \frac{d}{d+b+c}\cdot\frac{2a+b+c}{2n} + + Notes + ----- + Most catalogs of similarity coefficients + :cite:`Warrens:2008,Morris:2012,Xiang:2013` omit the :math:`n` terms in the + denominators, but the worked example in :cite:`Harris:1978` makes it clear + that this is intended in the original. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize HarrisLahey instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(HarrisLahey, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Harris & Lahey similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Harris & Lahey similarity + + Examples + -------- + >>> cmp = HarrisLahey() + >>> cmp.sim('cat', 'hat') + 0.3367085964820711 + >>> cmp.sim('Niall', 'Neil') + 0.22761577457069784 + >>> cmp.sim('aluminum', 'Catalan') + 0.07244410503054725 + >>> cmp.sim('ATCG', 'TAGC') + 0.006296204706372345 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + score = 0.0 + if a and (d + b + c): + score += a / (a + b + c) * (2 * d + b + c) / (2 * n) + if d and (a + b + c): + score += d / (d + b + c) * (2 * a + b + c) / (2 * n) + return score + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hassanat.py b/abydos/distance/_hassanat.py new file mode 100644 index 000000000..a4f3f975f --- /dev/null +++ b/abydos/distance/_hassanat.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
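# A minimal standalone sketch of the Harris & Lahey formula documented above,
# sim = a/(a+b+c) * (2d+b+c)/(2n) + d/(d+b+c) * (2a+b+c)/(2n), written
# independently of the library.  The counts a=2, b=2, c=2, d=778 are
# hypothetical, chosen to be consistent with the 'cat'/'hat' doctest under
# the apparent defaults (bigram tokens plus a 784-symbol alphabet).
def harris_lahey_from_counts(a, b, c, d):
    """Compute Harris & Lahey similarity from 2x2 confusion-table counts."""
    n = a + b + c + d
    score = 0.0
    if a and (d + b + c):
        score += a / (a + b + c) * (2 * d + b + c) / (2 * n)
    if d and (a + b + c):
        score += d / (d + b + c) * (2 * a + b + c) / (2 * n)
    return score


print(harris_lahey_from_counts(2, 2, 2, 778))  # ~0.336709, cf. HarrisLahey().sim('cat', 'hat')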
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._hassanat. + +Hassanat distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Hassanat'] + + +class Hassanat(_TokenDistance): + r"""Hassanat distance. + + For two multisets X and Y drawn from an alphabet S, Hassanat distance + :cite:`Hassanat:2014` is + + .. math:: + + dist_{Hassanat}(X, Y) = \sum_{i \in S} D(X_i, Y_i) + + where + + .. math:: + + D(X_i, Y_i) = + \left\{\begin{array}{ll} + 1-\frac{1+min(X_i, Y_i)}{1+max(X_i, Y_i)}&, + min(X_i, Y_i) \geq 0 + \\ + \\ + 1-\frac{1+min(X_i, Y_i)+|min(X_i, Y_i)|} + {1+max(X_i, Y_i)+|min(X_i, Y_i)|}&, + min(X_i, Y_i) < 0 + \end{array}\right. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Hassanat instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Hassanat, self).__init__(tokenizer=tokenizer, **kwargs) + + def dist_abs(self, src, tar): + """Return the Hassanat distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hassanat distance + + Examples + -------- + >>> cmp = Hassanat() + >>> cmp.dist_abs('cat', 'hat') + 2.0 + >>> cmp.dist_abs('Niall', 'Neil') + 3.5 + >>> cmp.dist_abs('aluminum', 'Catalan') + 7.166666666666667 + >>> cmp.dist_abs('ATCG', 'TAGC') + 5.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + distance = 0.0 + + for tok in self._total().keys(): + x = self._src_tokens[tok] + y = self._tar_tokens[tok] + + min_val = min(x, y) + if min_val >= 0: + distance += 1 - (1 + min_val) / (1 + max(x, y)) + else: + distance += 1 - 1 / (1 + max(x, y) - min_val) + + return distance + + def dist(self, src, tar): + """Return the normalized Hassanat distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Hassanat distance + + Examples + -------- + >>> cmp = Hassanat() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.3888888888888889 + >>> cmp.dist('aluminum', 'Catalan') + 0.4777777777777778 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + return self.dist_abs(src, tar) / len(self._total().keys()) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hawkins_dotson.py b/abydos/distance/_hawkins_dotson.py new file mode 100644 index 000000000..96a2917f6 --- /dev/null +++ b/abydos/distance/_hawkins_dotson.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. 
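# A minimal standalone sketch of the Hassanat distance documented above.
# Because q-gram counts are never negative, only the first branch of D
# applies: 1 - (1 + min(x, y)) / (1 + max(x, y)), summed over the union of
# tokens.  The bigrams below, including the '$'/'#' start and stop symbols,
# are assumed to match the library's default tokenization of 'cat' and 'hat'.
from collections import Counter


def hassanat_from_counts(src_counts, tar_counts):
    """Compute Hassanat distance between two non-negative token multisets."""
    total = 0.0
    for tok in set(src_counts) | set(tar_counts):
        x, y = src_counts[tok], tar_counts[tok]
        total += 1 - (1 + min(x, y)) / (1 + max(x, y))
    return total


src = Counter(['$c', 'ca', 'at', 't#'])  # assumed bigrams of 'cat'
tar = Counter(['$h', 'ha', 'at', 't#'])  # assumed bigrams of 'hat'
print(hassanat_from_counts(src, tar))  # 2.0, cf. Hassanat().dist_abs('cat', 'hat')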
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._hawkins_dotson. + +Hawkins & Dotson similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['HawkinsDotson'] + + +class HawkinsDotson(_TokenDistance): + r"""Hawkins & Dotson similarity. + + For two sets X and Y and a population N, Hawkins & Dotson similarity + :cite:`Hawkins:1973` is the mean of the occurrence agreement and + non-occurrence agreement + + .. math:: + + sim_{HawkinsDotson}(X, Y) = + \frac{1}{2}\cdot\Big( + \frac{|X \cap Y|}{|X \cup Y|}+ + \frac{|(N \setminus X) \setminus Y|}{|N \setminus (X \cap Y)|} + \Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{HawkinsDotson} = + \frac{1}{2}\cdot\Big(\frac{a}{a+b+c}+\frac{d}{b+c+d}\Big) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize HawkinsDotson instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(HawkinsDotson, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Hawkins & Dotson similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hawkins & Dotson similarity + + Examples + -------- + >>> cmp = HawkinsDotson() + >>> cmp.sim('cat', 'hat') + 0.6641091219096334 + >>> cmp.sim('Niall', 'Neil') + 0.606635407786303 + >>> cmp.sim('aluminum', 'Catalan') + 0.5216836734693877 + >>> cmp.sim('ATCG', 'TAGC') + 0.49362244897959184 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + score = 0.0 + if a: + score += a / (a + b + c) + if d: + score += d / (b + c + d) + return score / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hellinger.py b/abydos/distance/_hellinger.py new file mode 100644 index 000000000..092d2c793 --- /dev/null +++ b/abydos/distance/_hellinger.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._hellinger. + +Hellinger distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Hellinger'] + + +class Hellinger(_TokenDistance): + r"""Hellinger distance. + + For two multisets X and Y drawn from an alphabet S, Hellinger distance + :cite:`Hellinger:1909` is + + .. math:: + + dist_{Hellinger}(X, Y) = + \sqrt{2 \cdot \sum_{i \in S} (\sqrt{|A_i|} - \sqrt{|B_i|})^2} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Hellinger instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Hellinger, self).__init__(tokenizer=tokenizer, **kwargs) + + def dist_abs(self, src, tar): + """Return the Hellinger distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hellinger distance + + Examples + -------- + >>> cmp = Hellinger() + >>> cmp.dist_abs('cat', 'hat') + 2.8284271247461903 + >>> cmp.dist_abs('Niall', 'Neil') + 3.7416573867739413 + >>> cmp.dist_abs('aluminum', 'Catalan') + 5.477225575051661 + >>> cmp.dist_abs('ATCG', 'TAGC') + 4.47213595499958 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + + return ( + 2 + * sum( + ( + (abs(self._src_tokens[tok])) ** 0.5 + - (abs(self._tar_tokens[tok])) ** 0.5 + ) + ** 2 + for tok in alphabet + ) + ) ** 0.5 + + def dist(self, src, tar): + """Return the normalized Hellinger distance of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Hellinger distance + + Examples + -------- + >>> cmp = Hellinger() + >>> cmp.dist('cat', 'hat') + 0.8164965809277261 + >>> cmp.dist('Niall', 'Neil') + 0.881917103688197 + >>> cmp.dist('aluminum', 'Catalan') + 0.9128709291752769 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + score = self.dist_abs(src, tar) + norm = ( + 2 + * sum( + max(self._src_tokens[tok], self._tar_tokens[tok]) ** 2 + for tok in self._total().keys() + ) + ) ** 0.5 + return score / norm + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_higuera_mico.py b/abydos/distance/_higuera_mico.py new file mode 100644 index 000000000..0a7a27ac6 --- /dev/null +++ b/abydos/distance/_higuera_mico.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._higuera_mico. + +The Higuera-Micó contextual normalized edit distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import full as np_full + +from six.moves import range + +from ._distance import _Distance + +__all__ = ['HigueraMico'] + + +class HigueraMico(_Distance): + """The Higuera-Micó contextual normalized edit distance. + + This is presented in :cite:`Higuera:2008`. + + This measure is not normalized to a particular range. Indeed, for an + string of infinite length as and a string of 0 length, the contextual + normalized edit distance would be infinity. But so long as the relative + difference in string lengths is not too great, the distance will generally + remain below 1.0 + + Notes + ----- + The "normalized" version of this distance, implemented in the dist method + is merely the minimum of the distance and 1.0. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, **kwargs): + """Initialize Levenshtein instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(HigueraMico, self).__init__(**kwargs) + + def dist_abs(self, src, tar): + """Return the Higuera-Micó distance between two strings. + + This is a straightforward implementation of Higuera & Micó pseudocode + from :cite:`Higuera:2008`, ported to Numpy. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The Higuera-Micó distance between src & tar + + Examples + -------- + >>> cmp = HigueraMico() + >>> cmp.dist_abs('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist_abs('Niall', 'Neil') + 0.5333333333333333 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.7916666666666667 + >>> cmp.dist_abs('ATCG', 'TAGC') + 0.6000000000000001 + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + mx = np_full( + (len(src) + 1, len(tar) + 1, len(src) + len(tar) + 1), + fill_value=float('-inf'), + dtype=float, + ) + + for i in range(1, len(src) + 1): + mx[i, 0, i] = 0 + for j in range(len(tar) + 1): + mx[0, j, j] = j + for i in range(1, len(src) + 1): + for j in range(1, len(tar) + 1): + if src[i - 1] == tar[j - 1]: + for k in range(len(src) + len(tar) + 1): + mx[i, j, k] = mx[i - 1, j - 1, k] + else: + for k in range(1, len(src) + len(tar) + 1): + mx[i, j, k] = mx[i - 1, j - 1, k - 1] + for k in range(1, len(src) + len(tar) + 1): + mx[i, j, k] = max( + mx[i - 1, j, k - 1], + mx[i, j - 1, k - 1] + 1, + mx[i, j, k], + ) + + min_dist = float('inf') + for k in range(len(src) + len(tar) + 1): + if mx[len(src), len(tar), k] >= 0: + n_i = int(mx[len(src), len(tar), k]) + n_d = len(src) - len(tar) + n_i + n_s = k - (n_i + n_d) + loc_dist = 0 + for i in range(len(src) + 1, len(src) + n_i + 1): + loc_dist += 1 / i + loc_dist += n_s / (len(src) + n_i) + for i in range(len(tar) + 1, len(tar) + n_d + 1): + loc_dist += 1 / i + if loc_dist < min_dist: + min_dist = loc_dist + + return min_dist + + def dist(self, src, tar): + """Return the bounded Higuera-Micó distance between two strings. + + This is the distance bounded to the range [0, 1]. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The bounded Higuera-Micó distance between src & tar + + Examples + -------- + >>> cmp = HigueraMico() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.5333333333333333 + >>> cmp.dist('aluminum', 'Catalan') + 0.7916666666666667 + >>> cmp.dist('ATCG', 'TAGC') + 0.6000000000000001 + + .. versionadded:: 0.4.0 + + """ + return min(1.0, self.dist_abs(src, tar)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_hurlbert.py b/abydos/distance/_hurlbert.py new file mode 100644 index 000000000..4b09d4025 --- /dev/null +++ b/abydos/distance/_hurlbert.py @@ -0,0 +1,243 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._hurlbert. 
+ +Hurlbert correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import ceil, copysign, floor + +from ._token_distance import _TokenDistance + +__all__ = ['Hurlbert'] + + +class Hurlbert(_TokenDistance): + r"""Hurlbert correlation. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + Hurlbert's coefficient of interspecific association :cite:`Hurlbert:1969` + is + + .. math:: + + corr_{Hurlbert} = + \frac{ad-bc}{|ad-bc|} \sqrt{\frac{Obs_{\chi^2}-Min_{\chi^2}} + {Max_{\chi^2}-Min_{\chi^2}}} + + Where: + + .. math:: + + \begin{array}{lll} + Obs_{\chi^2} &= \frac{(ad-bc)^2n}{(a+b)(a+c)(b+d)(c+d)} + + Max_{\chi^2} &= \frac{(a+b)(b+d)n}{(a+c)(c+d)} &\textrm{ when } + ad \geq bc + + Max_{\chi^2} &= \frac{(a+b)(a+c)n}{(b+d)(c+d)} &\textrm{ when } + ad < bc \textrm{ and } a \leq d + + Max_{\chi^2} &= \frac{(b+d)(c+d)n}{(a+b)(a+c)} &\textrm{ when } + ad < bc \textrm{ and } a > d + + Min_{\chi^2} &= \frac{n^3 (\hat{a} - g(\hat{a}))^2} + {(a+b)(a+c)(c+d)(b+d)} + + \textrm{where } \hat{a} &= \frac{(a+b)(a+c)}{n} + + \textrm{and } g(\hat{a}) &= \lfloor\hat{a}\rfloor + &\textrm{ when } ad < bc, + + \textrm{otherwise } g(\hat{a}) &= \lceil\hat{a}\rceil + \end{array} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Hurlbert instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Hurlbert, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Hurlbert correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hurlbert correlation + + Examples + -------- + >>> cmp = Hurlbert() + >>> cmp.corr('cat', 'hat') + 0.497416003373807 + >>> cmp.corr('Niall', 'Neil') + 0.32899851514665707 + >>> cmp.corr('aluminum', 'Catalan') + 0.10144329225459262 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return -1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + admbc = a * d - b * c + marginals_product = ( + max(1.0, a + b) + * max(1.0, a + c) + * max(1.0, b + d) + * max(1.0, c + d) + ) + + obs_chisq = admbc * admbc * n / marginals_product + + if a * d >= b * c: + max_chisq = ( + (a + b) * (b + d) * n / (max(1.0, a + c) * max(1.0, c + d)) + ) + elif a <= d: + max_chisq = ( + (a + b) * (a + c) * n / (max(1.0, b + d) * max(1.0, c + d)) + ) + else: + max_chisq = ( + (b + d) * (c + d) * n / (max(1.0, a + b) * max(1.0, a + c)) + ) + + a_hat = (a + b) * (a + c) / n + g_a_hat = ceil(a_hat) if a * d < b * c else floor(a_hat) + + min_chisq = n ** 3 * (a_hat - g_a_hat) ** 2 / marginals_product + + num = obs_chisq - min_chisq + if num: + return copysign(abs(num / (max_chisq - min_chisq)) ** 0.5, admbc) + return 0.0 + + def sim(self, src, tar): + """Return the Hurlbert similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Hurlbert similarity + + Examples + -------- + >>> cmp = Hurlbert() + >>> cmp.sim('cat', 'hat') + 0.7487080016869034 + >>> cmp.sim('Niall', 'Neil') + 0.6644992575733285 + >>> cmp.sim('aluminum', 'Catalan') + 0.5507216461272963 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_ident.py b/abydos/distance/_ident.py index a29832926..ba2af44af 100644 --- a/abydos/distance/_ident.py +++ b/abydos/distance/_ident.py @@ -28,13 +28,19 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ __all__ = ['Ident', 'dist_ident', 'sim_ident'] class Ident(_Distance): - """Identity distance and similarity.""" + """Identity distance and similarity. + + .. versionadded:: 0.3.6 + """ def sim(self, src, tar): """Return the identity similarity of two strings. @@ -62,10 +68,21 @@ def sim(self, src, tar): >>> cmp.sim('cat', 'cat') 1.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return 1.0 if src == tar else 0.0 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Ident.sim method instead.', +) def sim_ident(src, tar): """Return the identity similarity of two strings. @@ -91,10 +108,18 @@ def sim_ident(src, tar): >>> sim_ident('cat', 'cat') 1.0 + .. versionadded:: 0.1.0 + """ return Ident().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Ident.dist method instead.', +) def dist_ident(src, tar): """Return the identity distance between two strings. @@ -119,6 +144,8 @@ def dist_ident(src, tar): >>> dist_ident('cat', 'cat') 0.0 + .. versionadded:: 0.1.0 + """ return Ident().dist(src, tar) diff --git a/abydos/distance/_indel.py b/abydos/distance/_indel.py index 5aec926b2..69fd272c4 100644 --- a/abydos/distance/_indel.py +++ b/abydos/distance/_indel.py @@ -28,51 +28,38 @@ unicode_literals, ) -from ._distance import _Distance +from deprecation import deprecated + from ._levenshtein import Levenshtein +from .. 
import __version__ __all__ = ['Indel', 'dist_indel', 'indel', 'sim_indel'] -class Indel(_Distance): +class Indel(Levenshtein): """Indel distance. This is equivalent to Levenshtein distance, when only inserts and deletes are possible. - """ - _lev = Levenshtein() + .. versionadded:: 0.3.6 + + """ - def dist_abs(self, src, tar): - """Return the indel distance between two strings. + def __init__(self, **kwargs): + """Initialize Levenshtein instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison + **kwargs + Arbitrary keyword arguments - Returns - ------- - int - Indel distance - Examples - -------- - >>> cmp = Indel() - >>> cmp.dist_abs('cat', 'hat') - 2 - >>> cmp.dist_abs('Niall', 'Neil') - 3 - >>> cmp.dist_abs('Colin', 'Cuilen') - 5 - >>> cmp.dist_abs('ATCG', 'TAGC') - 4 + .. versionadded:: 0.4.0 """ - return self._lev.dist_abs( - src, tar, mode='lev', cost=(1, 1, 9999, 9999) + super(Indel, self).__init__( + mode='lev', cost=(1, 1, 9999, 9999), **kwargs ) def dist(self, src, tar): @@ -105,12 +92,21 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.5 + + .. versionadded:: 0.3.6 + """ if src == tar: return 0.0 return self.dist_abs(src, tar) / (len(src) + len(tar)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Indel.dist_abs method instead.', +) def indel(src, tar): """Return the indel distance between two strings. @@ -137,10 +133,18 @@ def indel(src, tar): >>> indel('ATCG', 'TAGC') 4 + .. versionadded:: 0.3.0 + """ return Indel().dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Indel.dist method instead.', +) def dist_indel(src, tar): """Return the normalized indel distance between two strings. @@ -170,10 +174,18 @@ def dist_indel(src, tar): >>> dist_indel('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.3.0 + """ return Indel().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Indel.sim method instead.', +) def sim_indel(src, tar): """Return the normalized indel similarity of two strings. @@ -203,6 +215,8 @@ def sim_indel(src, tar): >>> sim_indel('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.3.0 + """ return Indel().sim(src, tar) diff --git a/abydos/distance/_iterative_substring.py b/abydos/distance/_iterative_substring.py new file mode 100644 index 000000000..e0d0bdd4a --- /dev/null +++ b/abydos/distance/_iterative_substring.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._iterative_substring. 
+ +Iterative-SubString (I-Sub) correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +__all__ = ['IterativeSubString'] + + +class IterativeSubString(_Distance): + r"""Iterative-SubString correlation. + + Iterative-SubString (I-Sub) correlation :cite:`Stoilos:2005` + + This is a straightforward port of the primary author's Java implementation: + http://www.image.ece.ntua.gr/~gstoil/software/I_Sub.java + + .. versionadded:: 0.4.0 + """ + + def __init__(self, hamacher=0.6, normalize_strings=False, **kwargs): + """Initialize IterativeSubString instance. + + Parameters + ---------- + hamacher : float + The constant factor for the Hamacher product + normalize_strings : bool + Normalize the strings by removing the characters in '._ ' and + lower casing + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(IterativeSubString, self).__init__(**kwargs) + self._normalize_strings = normalize_strings + self._hamacher = hamacher + + def corr(self, src, tar): + """Return the Iterative-SubString correlation of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Iterative-SubString correlation + + Examples + -------- + >>> cmp = IterativeSubString() + >>> cmp.corr('cat', 'hat') + -1.0 + >>> cmp.corr('Niall', 'Neil') + -0.9 + >>> cmp.corr('aluminum', 'Catalan') + -1.0 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + input_src = src + input_tar = tar + + def _winkler_improvement(src, tar, commonality): + for i in range(min(len(src), len(tar))): + if src[i] != tar[i]: + break + return min(4, i) * 0.1 * (1 - commonality) + + if self._normalize_strings: + src = src.lower() + tar = tar.lower() + + for ch in '._ ': + src = src.replace(ch, '') + tar = tar.replace(ch, '') + + src_len = len(src) + tar_len = len(tar) + + if src_len == 0 and tar_len == 0: + return 1.0 + if src_len == 0 or tar_len == 0: + return -1.0 + + common = 0 + best = 2 + + while len(src) > 0 and len(tar) > 0 and best != 0: + best = 0 + + ls = len(src) + lt = len(tar) + + start_src = 0 + end_src = 0 + start_tar = 0 + end_tar = 0 + + i = 0 + while i < ls and ls - i > best: + j = 0 + while lt - j > best: + k = i + + while j < lt and src[k] != tar[j]: + j += 1 + + if j != lt: + p = j + j += 1 + k += 1 + while j < lt and k < ls and src[k] == tar[j]: + j += 1 + k += 1 + if k - i > best: + best = k - i + start_src = i + end_src = k + start_tar = p + end_tar = j + i += 1 + + src = src[:start_src] + src[end_src:] + tar = tar[:start_tar] + tar[end_tar:] + + if best > 2: + common += best + else: + best = 0 + + commonality = 2.0 * common / (src_len + tar_len) + winkler_improvement = _winkler_improvement( + input_src, input_tar, commonality + ) + + unmatched_src = max(src_len - common, 0) / src_len + unmatched_tar = max(tar_len - common, 0) / tar_len + + unmatched_prod = unmatched_src * unmatched_tar + dissimilarity = unmatched_prod / ( + self._hamacher + + (1 - self._hamacher) + * (unmatched_src + unmatched_tar - unmatched_prod) + ) + + return commonality - dissimilarity + winkler_improvement + + def sim(self, src, tar): + """Return the Iterative-SubString similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Iterative-SubString similarity + + Examples + -------- + >>> cmp = IterativeSubString() + >>> cmp.sim('cat', 'hat') + 0.0 + >>> cmp.sim('Niall', 'Neil') + 0.04999999999999999 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (self.corr(src, tar) + 1.0) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_jaccard.py b/abydos/distance/_jaccard.py index f4c3ef06c..fe65d33df 100644 --- a/abydos/distance/_jaccard.py +++ b/abydos/distance/_jaccard.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -30,7 +30,10 @@ from math import log +from deprecation import deprecated + from ._tversky import Tversky +from .. import __version__ __all__ = ['Jaccard', 'dist_jaccard', 'sim_jaccard', 'tanimoto'] @@ -39,16 +42,75 @@ class Jaccard(Tversky): r"""Jaccard similarity. For two sets X and Y, the Jaccard similarity coefficient - :cite:`Jaccard:1901` is :math:`sim_{Jaccard}(X, Y) = - \frac{|X \cap Y|}{|X \cup Y|}`. + :cite:`Jaccard:1901,Ruzicka:1958` is + + .. math:: + + sim_{Jaccard}(X, Y) = + \frac{|X \cap Y|}{|X \cup Y|}`. This is identical to the Tanimoto similarity coefficient :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for :math:`\alpha = \beta = 1`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Jaccard} = + \frac{a}{a+b+c} + + Notes + ----- + The multiset variant is termed Ellenberg similarity :cite:`Ellenberg:1956`. + + .. versionadded:: 0.3.6 + """ - def sim(self, src, tar, qval=2): + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Jaccard instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Jaccard, self).__init__( + alpha=1, + beta=1, + bias=None, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): r"""Return the Jaccard similarity of two strings. Parameters @@ -57,8 +119,6 @@ def sim(self, src, tar, qval=2): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version Returns ------- @@ -77,10 +137,15 @@ def sim(self, src, tar, qval=2): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ - return super(self.__class__, self).sim(src, tar, qval, 1, 1) + return super(Jaccard, self).sim(src, tar) - def tanimoto_coeff(self, src, tar, qval=2): + def tanimoto_coeff(self, src, tar): """Return the Tanimoto distance between two strings. Tanimoto distance :cite:`Tanimoto:1958` is @@ -92,8 +157,6 @@ def tanimoto_coeff(self, src, tar, qval=2): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version Returns ------- @@ -112,14 +175,25 @@ def tanimoto_coeff(self, src, tar, qval=2): >>> cmp.tanimoto_coeff('ATCG', 'TAGC') -inf + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - coeff = self.sim(src, tar, qval) + coeff = self.sim(src, tar) if coeff != 0: return log(coeff, 2) return float('-inf') +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Jaccard.sim method instead.', +) def sim_jaccard(src, tar, qval=2): """Return the Jaccard similarity of two strings. @@ -132,7 +206,7 @@ def sim_jaccard(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -150,10 +224,19 @@ def sim_jaccard(src, tar, qval=2): >>> sim_jaccard('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + """ - return Jaccard().sim(src, tar, qval) + return Jaccard(qval=qval).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Jaccard.dist method instead.', +) def dist_jaccard(src, tar, qval=2): """Return the Jaccard distance between two strings. @@ -166,7 +249,7 @@ def dist_jaccard(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -184,10 +267,19 @@ def dist_jaccard(src, tar, qval=2): >>> dist_jaccard('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.1.0 + """ - return Jaccard().dist(src, tar, qval) + return Jaccard(qval=qval).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Jaccard.tanimoto_coeff method instead.', +) def tanimoto(src, tar, qval=2): """Return the Tanimoto coefficient of two strings. @@ -200,7 +292,7 @@ def tanimoto(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -218,8 +310,11 @@ def tanimoto(src, tar, qval=2): >>> tanimoto('ATCG', 'TAGC') -inf + + .. versionadded:: 0.1.0 + """ - return Jaccard().tanimoto_coeff(src, tar, qval) + return Jaccard(qval=qval).tanimoto_coeff(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_jaccard_nm.py b/abydos/distance/_jaccard_nm.py new file mode 100644 index 000000000..e6c6bae67 --- /dev/null +++ b/abydos/distance/_jaccard_nm.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
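The Jaccard hunks above show the migration pattern applied throughout this diff: per-call parameters such as qval move into the constructor, and the old functional wrappers remain only as deprecated shims that build a configured instance. A minimal sketch of equivalent old- and new-style calls, assuming both spellings stay importable from abydos.distance until the wrappers are removed in 0.6.0:

    from abydos.distance import Jaccard, sim_jaccard, tanimoto

    jac = Jaccard(qval=2)                  # q-gram length is now fixed at construction
    new_sim = jac.sim('Niall', 'Neil')
    new_tan = jac.tanimoto_coeff('Niall', 'Neil')  # log2 of the Jaccard similarity

    # Deprecated equivalents; each simply constructs Jaccard(qval=qval) internally.
    old_sim = sim_jaccard('Niall', 'Neil', qval=2)
    old_tan = tanimoto('Niall', 'Neil', qval=2)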
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._jaccard_nm. + +Jaccard-NM similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['JaccardNM'] + + +class JaccardNM(_TokenDistance): + r"""Jaccard-NM similarity. + + For two sets X and Y and a population N, Jaccard-NM similarity + :cite:`Naseem:2011` is + + .. math:: + + sim_{JaccardNM}(X, Y) = + \frac{|X \cap Y|} + {|N| + |X \cap Y| + |X \setminus Y| + |Y \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{JaccardNM} = + \frac{a}{2(a+b+c)+d} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize JaccardNM instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(JaccardNM, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Jaccard-NM similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Jaccard-NM similarity + + Examples + -------- + >>> cmp = JaccardNM() + >>> cmp.sim_score('cat', 'hat') + 0.002531645569620253 + >>> cmp.sim_score('Niall', 'Neil') + 0.0025220680958385876 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.0012484394506866417 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + return a / (a + b + c + n) + + def sim(self, src, tar): + """Return the Jaccard-NM similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Jaccard-NM similarity + + Examples + -------- + >>> cmp = JaccardNM() + >>> cmp.sim('cat', 'hat') + 0.005063291139240506 + >>> cmp.sim('Niall', 'Neil') + 0.005044136191677175 + >>> cmp.sim('aluminum', 'Catalan') + 0.0024968789013732834 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return 2 * self.sim_score(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_jaro_winkler.py b/abydos/distance/_jaro_winkler.py index e8c9d69f6..b5e84f092 100644 --- a/abydos/distance/_jaro_winkler.py +++ b/abydos/distance/_jaro_winkler.py @@ -32,9 +32,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ from ..tokenizer import QGrams __all__ = ['JaroWinkler', 'dist_jaro_winkler', 'sim_jaro_winkler'] @@ -50,26 +53,23 @@ class JaroWinkler(_Distance): http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c :cite:`Winkler:1994`. The above file is a US Government publication and, accordingly, in the public domain. + + .. versionadded:: 0.3.6 """ - def sim( + def __init__( self, - src, - tar, qval=1, mode='winkler', long_strings=False, boost_threshold=0.7, scaling_factor=0.1, + **kwargs ): - """Return the Jaro or Jaro-Winkler similarity of two strings. + """Initialize JaroWinkler instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison qval : int The length of each q-gram (defaults to 1: character-wise matching) mode : str @@ -94,6 +94,27 @@ def sim( for matching prefixes (defaults to 0.1). (Used in 'winkler' mode only.) + + .. versionadded:: 0.4.0 + + """ + super(JaroWinkler, self).__init__(**kwargs) + self._qval = qval + self._mode = mode + self._long_strings = long_strings + self._boost_threshold = boost_threshold + self._scaling_factor = scaling_factor + + def sim(self, src, tar): + """Return the Jaro or Jaro-Winkler similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + Returns ------- float @@ -128,14 +149,19 @@ def sim( >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12) 0.833333333333 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - if mode == 'winkler': - if boost_threshold > 1 or boost_threshold < 0: + if self._mode == 'winkler': + if self._boost_threshold > 1 or self._boost_threshold < 0: raise ValueError( 'Unsupported boost_threshold assignment; ' + 'boost_threshold must be between 0 and 1.' ) - if scaling_factor > 0.25 or scaling_factor < 0: + if self._scaling_factor > 0.25 or self._scaling_factor < 0: raise ValueError( 'Unsupported scaling_factor assignment; ' + 'scaling_factor must be between 0 and 0.25.' 
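The hunks above move JaroWinkler's tuning parameters (qval, mode, long_strings, boost_threshold, scaling_factor) from sim() into __init__, so a configured measure can be reused across comparisons; the validation of boost_threshold and scaling_factor now reads those values back off the instance. A rough sketch of the resulting call pattern, assuming the class and the deprecated wrapper are both exposed by abydos.distance:

    from abydos.distance import JaroWinkler, sim_jaro_winkler

    # 0.4.0 style: configure once, then reuse the instance.
    jw = JaroWinkler(mode='winkler', long_strings=True)
    score = jw.sim('Niall', 'Neil')      # similarity in [0.0, 1.0]
    distance = jw.dist('Niall', 'Neil')  # complement of sim for this measure

    # Pre-0.4.0 style: still works, but deprecated until removal in 0.6.0.
    legacy = sim_jaro_winkler('Niall', 'Neil', mode='winkler', long_strings=True)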
@@ -144,8 +170,8 @@ def sim( if src == tar: return 1.0 - src = QGrams(src.strip(), qval)._ordered_list - tar = QGrams(tar.strip(), qval)._ordered_list + src = QGrams(self._qval).tokenize(src.strip()).get_list() + tar = QGrams(self._qval).tokenize(tar.strip()).get_list() lens = len(src) lent = len(tar) @@ -205,21 +231,21 @@ def sim( # Continue to boost the weight if the strings are similar # This is the Winkler portion of Jaro-Winkler distance - if mode == 'winkler' and weight > boost_threshold: + if self._mode == 'winkler' and weight > self._boost_threshold: # Adjust for having up to the first 4 characters in common j = 4 if (minv >= 4) else minv i = 0 while (i < j) and (src[i] == tar[i]): i += 1 - weight += i * scaling_factor * (1.0 - weight) + weight += i * self._scaling_factor * (1.0 - weight) # Optionally adjust for long strings. # After agreeing beginning chars, at least two more must agree and # the agreeing characters must be > .5 of remaining characters. if ( - long_strings + self._long_strings and (minv > 4) and (num_com > i + 1) and (2 * num_com >= minv + i) @@ -231,6 +257,12 @@ def sim( return weight +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the JaroWinkler.sim method instead.', +) def sim_jaro_winkler( src, tar, @@ -297,12 +329,20 @@ def sim_jaro_winkler( >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12) 0.833333333333 + .. versionadded:: 0.1.0 + """ - return JaroWinkler().sim( - src, tar, qval, mode, long_strings, boost_threshold, scaling_factor - ) + return JaroWinkler( + qval, mode, long_strings, boost_threshold, scaling_factor + ).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the JaroWinkler.dist method instead.', +) def dist_jaro_winkler( src, tar, @@ -369,10 +409,12 @@ def dist_jaro_winkler( >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12) 0.166666666667 + .. versionadded:: 0.1.0 + """ - return JaroWinkler().dist( - src, tar, qval, mode, long_strings, boost_threshold, scaling_factor - ) + return JaroWinkler( + qval, mode, long_strings, boost_threshold, scaling_factor + ).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_jensen_shannon.py b/abydos/distance/_jensen_shannon.py new file mode 100644 index 000000000..e3ac357ec --- /dev/null +++ b/abydos/distance/_jensen_shannon.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._jensen_shannon. + +Jensen-Shannon divergence +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['JensenShannon'] + + +class JensenShannon(_TokenDistance): + r"""Jensen-Shannon divergence. 
+ + Jensen-Shannon divergence :cite:`Dagan:1999` of two multi-sets X and Y is + + .. math:: + + \begin{array}{rl} + dist_{JS}(X, Y) &= log 2 + \frac{1}{2} \sum_{i \in X \cap Y} + h(p(X_i) + p(Y_i)) - h(p(X_i)) - h(p(Y_i)) + + h(x) &= -x log x + + p(X_i \in X) &= \frac{|X_i|}{|X|} + \end{array} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize JensenShannon instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(JensenShannon, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def dist_abs(self, src, tar): + """Return the Jensen-Shannon divergence of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Jensen-Shannon divergence + + Examples + -------- + >>> cmp = JensenShannon() + >>> cmp.dist_abs('cat', 'hat') + 0.3465735902799726 + >>> cmp.dist_abs('Niall', 'Neil') + 0.44051045978517045 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.6115216713968132 + >>> cmp.dist_abs('ATCG', 'TAGC') + 0.6931471805599453 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + def entropy(prob): + """Return the entropy of prob.""" + if not prob: + return 0.0 + return -(prob * log(prob)) + + src_total = sum(self._src_tokens.values()) + tar_total = sum(self._tar_tokens.values()) + + diverg = log(2) + for key in self._intersection().keys(): + p_src = self._src_tokens[key] / src_total + p_tar = self._tar_tokens[key] / tar_total + + diverg += ( + entropy(p_src + p_tar) - entropy(p_src) - entropy(p_tar) + ) / 2 + + return diverg + + def dist(self, src, tar): + """Return the normalized Jensen-Shannon distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Normalized Jensen-Shannon distance + + Examples + -------- + >>> cmp = JensenShannon() + >>> cmp.dist('cat', 'hat') + 0.49999999999999994 + >>> cmp.dist('Niall', 'Neil') + 0.6355222557917826 + >>> cmp.dist('aluminum', 'Catalan') + 0.8822392827203127 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + return self.dist_abs(src, tar) / log(2) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_johnson.py b/abydos/distance/_johnson.py new file mode 100644 index 000000000..be61f0c78 --- /dev/null +++ b/abydos/distance/_johnson.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
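In the JensenShannon class above, dist_abs reduces to log 2 when the tokenized strings share no q-grams, and dist simply divides by log 2, so fully disjoint strings normalize to exactly 1.0. A small sketch of that relationship, assuming the class is exposed by abydos.distance:

    from math import log
    from abydos.distance import JensenShannon

    js = JensenShannon()
    raw = js.dist_abs('ATCG', 'TAGC')  # no shared bigrams, so this equals log(2)
    norm = js.dist('ATCG', 'TAGC')     # raw / log(2) -> 1.0, as in the doctest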
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._johnson. + +Johnson similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Johnson'] + + +class Johnson(_TokenDistance): + r"""Johnson similarity. + + For two sets X and Y, the Johnson + similarity :cite:`Johnson:1967` is + + .. math:: + + sim_{Johnson}(X, Y) = + \frac{(|X \cap Y|}{|X|} + \frac{|Y \cap X}{|Y|}`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Johnson} = + \frac{a}{a+b}+\frac{a}{a+c} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Johnson instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Johnson, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim_score(self, src, tar): + """Return the Johnson similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Johnson similarity + + Examples + -------- + >>> cmp = Johnson() + >>> cmp.sim_score('cat', 'hat') + 1.0 + >>> cmp.sim_score('Niall', 'Neil') + 0.7333333333333334 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.2361111111111111 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 2.0 + + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 0.0 + + a = self._intersection_card() + ab = self._src_card() + ac = self._tar_card() + + return a / ab + a / ac + + def sim(self, src, tar): + """Return the normalized Johnson similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Johnson similarity + + Examples + -------- + >>> cmp = Johnson() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3666666666666667 + >>> cmp.sim('aluminum', 'Catalan') + 0.11805555555555555 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return self.sim_score(src, tar) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kendall_tau.py b/abydos/distance/_kendall_tau.py new file mode 100644 index 000000000..89b531856 --- /dev/null +++ b/abydos/distance/_kendall_tau.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. 
Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kendall_tau. + +Kendall's Tau correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KendallTau'] + + +class KendallTau(_TokenDistance): + r"""Kendall's Tau correlation. + + For two sets X and Y and a population N, Kendall's Tau correlation + :cite:`Kendall:1938` is + + .. math:: + + corr_{KendallTau}(X, Y) = + \frac{2 \cdot (|X \cap Y| + |(N \setminus X) \setminus Y| - + |X \triangle Y|)}{|N| \cdot (|N|-1)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KendallTau} = + \frac{2 \cdot (a+d-b-c)}{n \cdot (n-1)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KendallTau instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KendallTau, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kendall's Tau correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kendall's Tau correlation + + Examples + -------- + >>> cmp = KendallTau() + >>> cmp.corr('cat', 'hat') + 0.0025282143508744493 + >>> cmp.corr('Niall', 'Neil') + 0.00250866630176975 + >>> cmp.corr('aluminum', 'Catalan') + 0.0024535291823735866 + >>> cmp.corr('ATCG', 'TAGC') + 0.0024891182526650506 + + Notes + ----- + This correlation is not necessarily bounded to [-1.0, 1.0], but will + typically be within these bounds for real data. + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + num = a + d - b - c + if num: + return 2 * num / (n * max(n - 1, 1)) + return 0.0 + + def sim(self, src, tar): + """Return the Kendall's Tau similarity of two strings. + + The Tau correlation is first clamped to the range [-1.0, 1.0] before + being converted to a similarity value to ensure that the similarity + is in the range [0.0, 1.0]. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kendall's Tau similarity + + Examples + -------- + >>> cmp = KendallTau() + >>> cmp.sim('cat', 'hat') + 0.5012641071754372 + >>> cmp.sim('Niall', 'Neil') + 0.5012543331508849 + >>> cmp.sim('aluminum', 'Catalan') + 0.5012267645911868 + >>> cmp.sim('ATCG', 'TAGC') + 0.5012445591263325 + + + .. versionadded:: 0.4.0 + + """ + score = max(-1.0, min(1.0, self.corr(src, tar))) + return (1.0 + score) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kent_foster_i.py b/abydos/distance/_kent_foster_i.py new file mode 100644 index 000000000..4ad54fbc7 --- /dev/null +++ b/abydos/distance/_kent_foster_i.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kent_foster_i. + +Kent & Foster I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KentFosterI'] + + +class KentFosterI(_TokenDistance): + r"""Kent & Foster I similarity. + + For two sets X and Y and a population N, Kent & Foster I similarity + :cite:`Kent:1977`, :math:`K_{occ}`, is + + .. math:: + + sim_{KentFosterI}(X, Y) = + \frac{|X \cap Y| - \frac{|X|\cdot|Y|}{|X \cup Y|}} + {|X \cap Y| - \frac{|X|\cdot|Y|}{|X \cup Y|} + + |X \setminus Y| + |Y \setminus X|} + + Kent & Foster derived this from Cohen's :math:`\kappa` by "subtracting + appropriate chance agreement correction figures from the numerators and + denominators" to arrive at an occurrence reliability measure. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KentFosterI} = + \frac{a-\frac{(a+b)(a+c)}{a+b+c}}{a-\frac{(a+b)(a+c)}{a+b+c}+b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KentFosterI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KentFosterI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Kent & Foster I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kent & Foster I similarity + + Examples + -------- + >>> cmp = KentFosterI() + >>> cmp.sim_score('cat', 'hat') + -0.19999999999999996 + >>> cmp.sim_score('Niall', 'Neil') + -0.23529411764705888 + >>> cmp.sim_score('aluminum', 'Catalan') + -0.30434782608695654 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.3333333333333333 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + num = (a + b) * (a + c) + if not num: + bigterm = a + else: + bigterm = a - (num / (a + b + c)) + + if bigterm: + return bigterm / (bigterm + b + c) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Kent & Foster I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Kent & Foster I similarity + + Examples + -------- + >>> cmp = KentFosterI() + >>> cmp.sim('cat', 'hat') + 0.8 + >>> cmp.sim('Niall', 'Neil') + 0.7647058823529411 + >>> cmp.sim('aluminum', 'Catalan') + 0.6956521739130435 + >>> cmp.sim('ATCG', 'TAGC') + 0.6666666666666667 + + + .. versionadded:: 0.4.0 + + """ + return 1.0 + self.sim_score(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kent_foster_ii.py b/abydos/distance/_kent_foster_ii.py new file mode 100644 index 000000000..52a6b5dd2 --- /dev/null +++ b/abydos/distance/_kent_foster_ii.py @@ -0,0 +1,200 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kent_foster_ii. 
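A small sketch of KentFosterI.sim_score as defined above, worked on made-up cardinalities (the counts are assumptions, and the zero-product guard mirrors the implementation):

a, b, c = 2, 1, 1                                    # intersection, src-only, tar-only
num = (a + b) * (a + c)                              # 9
bigterm = a if not num else a - num / (a + b + c)    # 2 - 9/4 = -0.25
sim_score = bigterm / (bigterm + b + c) if bigterm else 0.0   # -0.25 / 1.75 = -1/7 ≈ -0.143
sim = 1.0 + sim_score                                # normalized similarity = 6/7 ≈ 0.857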
+ +Kent & Foster II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KentFosterII'] + + +class KentFosterII(_TokenDistance): + r"""Kent & Foster II similarity. + + For two sets X and Y and a population N, Kent & Foster II similarity + :cite:`Kent:1977`, :math:`K_{nonocc}`, is + + .. math:: + + sim_{KentFosterII}(X, Y) = + \frac{|(N \setminus X) \setminus Y| - + \frac{|X \setminus Y|\cdot|Y \setminus X|} + {|N \setminus (X \cap Y)|}} + {|(N \setminus X) \setminus Y| - + \frac{|X \setminus Y|\cdot|Y \setminus X|} + {|N \setminus (X \cap Y)|} + + |X \setminus Y| + |Y \setminus X|} + + Kent & Foster derived this from Cohen's :math:`\kappa` by "subtracting + appropriate chance agreement correction figures from the numerators and + denominators" to arrive at an non-occurrence reliability measure. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KentFosterII} = + \frac{d-\frac{(b+d)(c+d)}{b+c+d}}{d-\frac{(b+d)(c+d)}{b+c+d}+b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KentFosterII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KentFosterII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Kent & Foster II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kent & Foster II similarity + + Examples + -------- + >>> cmp = KentFosterII() + >>> cmp.sim_score('cat', 'hat') + -0.0012804097311239404 + >>> cmp.sim_score('Niall', 'Neil') + -0.002196997436837158 + >>> cmp.sim_score('aluminum', 'Catalan') + -0.004784688995214218 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.0031989763275758767 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (b + d) * (c + d) + if not num: + bigterm = d + else: + bigterm = d - (num / (b + c + d)) + + if bigterm: + return bigterm / (bigterm + b + c) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Kent & Foster II similarity of two strings. 
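The non-occurrence counterpart works the same way on the complement cells. A sketch of KentFosterII.sim_score with made-up counts (b, c, d below are assumptions):

b, c, d = 1, 1, 12                                   # src-only, tar-only, "neither" cells
num = (b + d) * (c + d)                              # 13 * 13 = 169
bigterm = d if not num else d - num / (b + c + d)    # 12 - 169/14 = -1/14
sim_score = bigterm / (bigterm + b + c) if bigterm else 0.0   # (-1/14) / (27/14) = -1/27
sim = 1.0 + sim_score                                # 26/27 ≈ 0.963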
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Kent & Foster II similarity + + Examples + -------- + >>> cmp = KentFosterII() + >>> cmp.sim('cat', 'hat') + 0.998719590268876 + >>> cmp.sim('Niall', 'Neil') + 0.9978030025631628 + >>> cmp.sim('aluminum', 'Catalan') + 0.9952153110047858 + >>> cmp.sim('ATCG', 'TAGC') + 0.9968010236724241 + + + .. versionadded:: 0.4.0 + + """ + return 1.0 + self.sim_score(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_koppen_i.py b/abydos/distance/_koppen_i.py new file mode 100644 index 000000000..dfd28f60b --- /dev/null +++ b/abydos/distance/_koppen_i.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._koppen_i. + +Köppen I correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KoppenI'] + + +class KoppenI(_TokenDistance): + r"""Köppen I correlation. + + For two sets X and Y and an alphabet N, provided that :math:`|X| = |Y|`, + Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is + + .. math:: + + corr_{KoppenI}(X, Y) = + \frac{|X| \cdot |N \setminus X| - |X \setminus Y|} + {|X| \cdot |N \setminus X|} + + To support cases where :math:`|X| \neq |Y|`, this class implements a slight + variation, while still providing the expected results when + :math:`|X| = |Y|`: + + .. math:: + + corr_{KoppenI}(X, Y) = + \frac{\frac{|X|+|Y|}{2} \cdot + \frac{|N \setminus X|+|N \setminus Y|}{2}- + \frac{|X \triangle Y|}{2}} + {\frac{|X|+|Y|}{2} \cdot + \frac{|N \setminus X|+|N \setminus Y|}{2}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KoppenI} = + \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}- + \frac{b+c}{2}} + {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}} + + Notes + ----- + In the usual case all of the above values should be proportional to the + total number of samples n. I.e., a, b, c, d, & n should all be divided by + n prior to calculating the coefficient. This class's default normalizer + is, accordingly, 'proportional'. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize KoppenI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KoppenI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def corr(self, src, tar): + """Return the Köppen I correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Köppen I correlation + + Examples + -------- + >>> cmp = KoppenI() + >>> cmp.corr('cat', 'hat') + 0.49615384615384617 + >>> cmp.corr('Niall', 'Neil') + 0.3575056927658083 + >>> cmp.corr('aluminum', 'Catalan') + 0.1068520131813188 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483896 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + abac_dbdc_mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4 + + num = abac_dbdc_mean_prod - (b + c) / 2 + if num: + return num / abac_dbdc_mean_prod + return 0.0 + + def sim(self, src, tar): + """Return the Köppen I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Köppen I similarity + + Examples + -------- + >>> cmp = KoppenI() + >>> cmp.sim('cat', 'hat') + 0.7480769230769231 + >>> cmp.sim('Niall', 'Neil') + 0.6787528463829041 + >>> cmp.sim('aluminum', 'Catalan') + 0.5534260065906594 + >>> cmp.sim('ATCG', 'TAGC') + 0.49679075738125805 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_koppen_ii.py b/abydos/distance/_koppen_ii.py new file mode 100644 index 000000000..9ce384815 --- /dev/null +++ b/abydos/distance/_koppen_ii.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
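A sketch of the modified Köppen I correlation from KoppenI.corr above, evaluated on raw made-up counts (assumptions). Note that the class defaults to the 'proportional' normalizer, which divides the cell counts by n before this step, so a real KoppenI instance will yield different numbers:

a, b, c, d = 2, 1, 1, 12
mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4    # 6 * 26 / 4 = 39
corr = (mean_prod - (b + c) / 2) / mean_prod         # 38 / 39 ≈ 0.974
sim = (1.0 + corr) / 2.0                             # ≈ 0.987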
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._koppen_ii. + +Köppen II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KoppenII'] + + +class KoppenII(_TokenDistance): + r"""Köppen II similarity. + + For two sets X and Y, Köppen II similarity + :cite:`Koppen:1870,Goodman:1959` is + + .. math:: + + sim_{KoppenII}(X, Y) = + |X \cap Y| + \frac{|X \setminus Y| + |Y \setminus X|}{2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KoppenII} = + a + \frac{b+c}{2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KoppenII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KoppenII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Köppen II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Köppen II similarity + + Examples + -------- + >>> cmp = KoppenII() + >>> cmp.sim_score('cat', 'hat') + 4.0 + >>> cmp.sim_score('Niall', 'Neil') + 5.5 + >>> cmp.sim_score('aluminum', 'Catalan') + 8.5 + >>> cmp.sim_score('ATCG', 'TAGC') + 5.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return a + (b + c) / 2 + + def sim(self, src, tar): + """Return the normalized Köppen II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Köppen II similarity + + Examples + -------- + >>> cmp = KoppenII() + >>> cmp.sim('cat', 'hat') + 0.6666666666666666 + >>> cmp.sim('Niall', 'Neil') + 0.6111111111111112 + >>> cmp.sim('aluminum', 'Catalan') + 0.53125 + >>> cmp.sim('ATCG', 'TAGC') + 0.5 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + score = self.sim_score(src, tar) + return score / self._union_card() + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuder_richardson.py b/abydos/distance/_kuder_richardson.py new file mode 100644 index 000000000..69ab4d5f6 --- /dev/null +++ b/abydos/distance/_kuder_richardson.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuder_richardson. + +Kuder & Richardson correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuderRichardson'] + + +class KuderRichardson(_TokenDistance): + r"""Kuder & Richardson correlation. + + For two sets X and Y and a population N, Kuder & Richardson similarity + :cite:`Kuder:1937,Cronbach:1951` is + + .. math:: + + corr_{KuderRichardson}(X, Y) = + \frac{4(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + {|X| \cdot |N \setminus X| + + |Y| \cdot |N \setminus Y| + + 2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuderRichardson} = + \frac{4(ad-bc)}{(a+b)(c+d) + (a+c)(b+d) +2(ad-bc)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuderRichardson instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuderRichardson, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuder & Richardson correlation of two strings. 
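A sketch of Köppen II as defined in KoppenII.sim_score and sim above: the raw score is simply normalized by the union cardinality a + b + c (counts below are assumptions):

a, b, c = 2, 1, 1
sim_score = a + (b + c) / 2       # 2 + 1 = 3.0
sim = sim_score / (a + b + c)     # 3 / 4 = 0.75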
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuder & Richardson correlation + + Examples + -------- + >>> cmp = KuderRichardson() + >>> cmp.corr('cat', 'hat') + 0.6643835616438356 + >>> cmp.corr('Niall', 'Neil') + 0.5285677463699631 + >>> cmp.corr('aluminum', 'Catalan') + 0.19499521400246136 + >>> cmp.corr('ATCG', 'TAGC') + -0.012919896640826873 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + admbc = a * d - b * c + denom = (a + b) * (c + d) + (a + c) * (b + d) + 2 * admbc + + if not admbc: + return 0.0 + elif not denom: + return float('-inf') + else: + return (4 * admbc) / denom + + def sim(self, src, tar): + """Return the Kuder & Richardson similarity of two strings. + + Since Kuder & Richardson correlation is unbounded in the negative, + this measure is first clamped to [-1.0, 1.0]. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuder & Richardson similarity + + Examples + -------- + >>> cmp = KuderRichardson() + >>> cmp.sim('cat', 'hat') + 0.8321917808219178 + >>> cmp.sim('Niall', 'Neil') + 0.7642838731849815 + >>> cmp.sim('aluminum', 'Catalan') + 0.5974976070012307 + >>> cmp.sim('ATCG', 'TAGC') + 0.4935400516795866 + + + .. versionadded:: 0.4.0 + + """ + score = max(-1.0, self.corr(src, tar)) + return (1.0 + score) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_i.py b/abydos/distance/_kuhns_i.py new file mode 100644 index 000000000..a2698a45d --- /dev/null +++ b/abydos/distance/_kuhns_i.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_i. + +Kuhns I correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsI'] + + +class KuhnsI(_TokenDistance): + r"""Kuhns I correlation. + + For two sets X and Y and a population N, Kuhns I correlation + :cite:`Kuhns:1965`, the excess of separation over its independence value + (S), is + + .. math:: + + corr_{KuhnsI}(X, Y) = + \frac{2\delta(X, Y)}{|N|} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsI} = + \frac{2\delta(a+b, a+c)}{n} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns I correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns I correlation + + Examples + -------- + >>> cmp = KuhnsI() + >>> cmp.corr('cat', 'hat') + 0.005049979175343606 + >>> cmp.corr('Niall', 'Neil') + 0.005004425239483548 + >>> cmp.corr('aluminum', 'Catalan') + 0.0023140898210880765 + >>> cmp.corr('ATCG', 'TAGC') + -8.134631403581842e-05 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return 2 * delta_ab / n + + def sim(self, src, tar): + """Return the Kuhns I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns I similarity + + Examples + -------- + >>> cmp = KuhnsI() + >>> cmp.sim('cat', 'hat') + 0.5050499791753436 + >>> cmp.sim('Niall', 'Neil') + 0.5050044252394835 + >>> cmp.sim('aluminum', 'Catalan') + 0.502314089821088 + >>> cmp.sim('ATCG', 'TAGC') + 0.49991865368596416 + + + .. versionadded:: 0.4.0 + + """ + return 0.5 + self.corr(src, tar) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_ii.py b/abydos/distance/_kuhns_ii.py new file mode 100644 index 000000000..82aa8f760 --- /dev/null +++ b/abydos/distance/_kuhns_ii.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
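A sketch of the Kuder & Richardson correlation defined in KuderRichardson.corr above, on made-up cell counts (assumptions):

a, b, c, d = 2, 1, 1, 12
admbc = a * d - b * c                                        # 24 - 1 = 23
denom = (a + b) * (c + d) + (a + c) * (b + d) + 2 * admbc    # 39 + 39 + 46 = 124
corr = 4 * admbc / denom                                     # 92 / 124 ≈ 0.742
sim = (1.0 + max(-1.0, corr)) / 2.0                          # ≈ 0.871

The max(-1.0, ...) mirrors the sim method's clamp, since the correlation is unbounded in the negative.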
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_ii. + +Kuhns II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsII'] + + +class KuhnsII(_TokenDistance): + r"""Kuhns II correlation. + + For two sets X and Y and a population N, Kuhns II correlation + :cite:`Kuhns:1965`, the excess of rectangular distance over its + independence value (R), is + + .. math:: + + corr_{KuhnsII}(X, Y) = + \frac{\delta(X, Y)}{max(|X|, |Y|)} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsII} = + \frac{\delta(a+b, a+c)}{max(a+b, a+c)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns II correlation + + Examples + -------- + >>> cmp = KuhnsII() + >>> cmp.corr('cat', 'hat') + 0.49489795918367346 + >>> cmp.corr('Niall', 'Neil') + 0.32695578231292516 + >>> cmp.corr('aluminum', 'Catalan') + 0.10092002830856334 + >>> cmp.corr('ATCG', 'TAGC') + -0.006377551020408163 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return delta_ab / (max(a + b, a + c)) + + def sim(self, src, tar): + """Return the Kuhns II similarity of two strings. 
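All of the Kuhns measures build on the shared term δ(X, Y) = a - (a + b)(a + c)/n and differ only in how it is rescaled. A sketch of δ and of Kuhns I (corr = 2δ/n, sim = 0.5 + corr) on made-up counts (assumptions):

a, b, c, d = 2, 1, 1, 12
n = a + b + c + d                        # 16
delta_ab = a - (a + b) * (a + c) / n     # 2 - 9/16 = 1.4375
corr_kuhns_i = 2 * delta_ab / n          # 2.875 / 16 ≈ 0.1797
sim_kuhns_i = 0.5 + corr_kuhns_i         # ≈ 0.6797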
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns II similarity + + Examples + -------- + >>> cmp = KuhnsII() + >>> cmp.sim('cat', 'hat') + 0.663265306122449 + >>> cmp.sim('Niall', 'Neil') + 0.5513038548752834 + >>> cmp.sim('aluminum', 'Catalan') + 0.40061335220570893 + >>> cmp.sim('ATCG', 'TAGC') + 0.32908163265306123 + + + .. versionadded:: 0.4.0 + + """ + return (0.5 + self.corr(src, tar)) / 1.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_iii.py b/abydos/distance/_kuhns_iii.py new file mode 100644 index 000000000..759afe59c --- /dev/null +++ b/abydos/distance/_kuhns_iii.py @@ -0,0 +1,216 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_iii. + +Kuhns III correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsIII'] + + +class KuhnsIII(_TokenDistance): + r"""Kuhns III correlation. + + For two sets X and Y and a population N, Kuhns III correlation + :cite:`Kuhns:1965`, the excess of proportion of overlap over its + independence value (P), is + + .. math:: + + corr_{KuhnsIII}(X, Y) = + \frac{\delta(X, Y)}{\big(1-\frac{|X \cap Y|}{|X|+|Y|}\big) + \big(|X|+|Y|-\frac{|X|\cdot|Y|}{|N|}\big)} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsIII} = + \frac{\delta(a+b, a+c)}{\big(1-\frac{a}{2a+b+c}\big) + \big(2a+b+c-\frac{(a+b)(a+c)}{n}\big)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + Notes + ----- + The coefficient presented in :cite:`Eidenberger:2014,Morris:2012` as Kuhns' + "Proportion of overlap above independence" is a significantly different + coefficient, not evidenced in :cite:`Kuhns:1965`. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns III correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns III correlation + + Examples + -------- + >>> cmp = KuhnsIII() + >>> cmp.corr('cat', 'hat') + 0.3307757885763001 + >>> cmp.corr('Niall', 'Neil') + 0.21873141468207793 + >>> cmp.corr('aluminum', 'Catalan') + 0.05707545392902886 + >>> cmp.corr('ATCG', 'TAGC') + -0.003198976327575176 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return delta_ab / ( + (1 - a / (2 * a + b + c)) + * (2 * a + b + c - ((a + b) * (a + c) / n)) + ) + + def sim(self, src, tar): + """Return the Kuhns III similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns III similarity + + Examples + -------- + >>> cmp = KuhnsIII() + >>> cmp.sim('cat', 'hat') + 0.498081841432225 + >>> cmp.sim('Niall', 'Neil') + 0.41404856101155846 + >>> cmp.sim('aluminum', 'Catalan') + 0.29280659044677165 + >>> cmp.sim('ATCG', 'TAGC') + 0.24760076775431863 + + + .. versionadded:: 0.4.0 + + """ + return (1 / 3 + self.corr(src, tar)) / (4 / 3) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_iv.py b/abydos/distance/_kuhns_iv.py new file mode 100644 index 000000000..05ee9360d --- /dev/null +++ b/abydos/distance/_kuhns_iv.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_iv. + +Kuhns IV correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsIV'] + + +class KuhnsIV(_TokenDistance): + r"""Kuhns IV correlation. 
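A sketch contrasting how Kuhns II and Kuhns III rescale the same δ with different denominators, on made-up counts (assumptions) chosen so that |X| ≠ |Y|:

a, b, c, d = 2, 2, 1, 11
n = a + b + c + d                                   # 16
delta_ab = a - (a + b) * (a + c) / n                # 2 - 12/16 = 1.25
corr_ii = delta_ab / max(a + b, a + c)              # 1.25 / 4 = 0.3125
sim_ii = (0.5 + corr_ii) / 1.5                      # ≈ 0.5417
denom_iii = (1 - a / (2 * a + b + c)) * (2 * a + b + c - (a + b) * (a + c) / n)
corr_iii = delta_ab / denom_iii                     # 1.25 / ((5/7) * 6.25) = 0.28
sim_iii = (1 / 3 + corr_iii) / (4 / 3)              # 0.46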
+ + For two sets X and Y and a population N, Kuhns IV correlation + :cite:`Kuhns:1965`, the excess of conditional probabilities over its + independence value (W), is + + .. math:: + + corr_{KuhnsIV}(X, Y) = + \frac{\delta(X, Y)}{min(|X|, |Y|)} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsIV} = + \frac{\delta(a+b, a+c)}{min(a+b, a+c)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns IV correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns IV correlation + + Examples + -------- + >>> cmp = KuhnsIV() + >>> cmp.corr('cat', 'hat') + 0.49489795918367346 + >>> cmp.corr('Niall', 'Neil') + 0.3923469387755102 + >>> cmp.corr('aluminum', 'Catalan') + 0.11353503184713376 + >>> cmp.corr('ATCG', 'TAGC') + -0.006377551020408163 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return delta_ab / (min(a + b, a + c)) + + def sim(self, src, tar): + """Return the Kuhns IV similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns IV similarity + + Examples + -------- + >>> cmp = KuhnsIV() + >>> cmp.sim('cat', 'hat') + 0.7474489795918368 + >>> cmp.sim('Niall', 'Neil') + 0.696173469387755 + >>> cmp.sim('aluminum', 'Catalan') + 0.5567675159235669 + >>> cmp.sim('ATCG', 'TAGC') + 0.4968112244897959 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_ix.py b/abydos/distance/_kuhns_ix.py new file mode 100644 index 000000000..5cd309ba2 --- /dev/null +++ b/abydos/distance/_kuhns_ix.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_ix. + +Kuhns IX correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsIX'] + + +class KuhnsIX(_TokenDistance): + r"""Kuhns IX correlation. + + For two sets X and Y and a population N, Kuhns IX correlation + :cite:`Kuhns:1965`, the excess of coefficient of linear correlation over + its independence value (L), is + + .. math:: + + corr_{KuhnsIX}(X, Y) = + \frac{\delta(X, Y)}{\sqrt{|X|\cdot|Y|\cdot(1-\frac{|X|}{|N|}) + \cdot(1-\frac{|Y|}{|N|})}} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsIX} = + \frac{\delta(a+b, a+c)}{\sqrt{(a+b)(a+c)(1-\frac{a+b}{n}) + (1-\frac{a+c}{n})}} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsIX instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsIX, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns IX correlation of two strings. 
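Kuhns IV divides δ by the smaller of the two set sizes, where Kuhns II used the larger. A sketch continuing the made-up counts above (assumptions):

a, b, c, d = 2, 2, 1, 11
n = a + b + c + d
delta_ab = a - (a + b) * (a + c) / n        # 1.25
corr_iv = delta_ab / min(a + b, a + c)      # 1.25 / 3 ≈ 0.4167
sim_iv = (1.0 + corr_iv) / 2.0              # ≈ 0.7083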
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns IX correlation + + Examples + -------- + >>> cmp = KuhnsIX() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.36069255713421955 + >>> cmp.corr('aluminum', 'Catalan') + 0.10821361655002706 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + marginals_product = ( + max(1, a + b) * max(1, a + c) * max(1, b + d) * max(1, c + d) + ) + # clamp to [-1.0, 1.0], strictly due to floating point precision + # issues + return max( + -1.0, min(1.0, (delta_ab * n / (marginals_product ** 0.5))) + ) + + def sim(self, src, tar): + """Return the Kuhns IX similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns IX similarity + + Examples + -------- + >>> cmp = KuhnsIX() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6803462785671097 + >>> cmp.sim('aluminum', 'Catalan') + 0.5541068082750136 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_v.py b/abydos/distance/_kuhns_v.py new file mode 100644 index 000000000..d119c25e5 --- /dev/null +++ b/abydos/distance/_kuhns_v.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_v. + +Kuhns V correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsV'] + + +class KuhnsV(_TokenDistance): + r"""Kuhns V correlation. + + For two sets X and Y and a population N, Kuhns V correlation + :cite:`Kuhns:1965`, the excess of probability differences U over its + independence value (U), is + + .. math:: + + corr_{KuhnsV}(X, Y) = + \frac{\delta(X, Y)} + {max\big(|X|\cdot(1-\frac{|X|}{|N|}), + |Y|\cdot(1-\frac{|Y|}{|N|})\big)} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. 
math:: + + corr_{KuhnsV} = + \frac{\delta(a+b, a+c)} + {max\big((a+b)(1-\frac{a+b}{n}), (a+c)(1-\frac{a+c}{n})\big)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns V correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns V correlation + + Examples + -------- + >>> cmp = KuhnsV() + >>> cmp.corr('cat', 'hat') + 0.497435897435897 + >>> cmp.corr('Niall', 'Neil') + 0.329477292202228 + >>> cmp.corr('aluminum', 'Catalan') + 0.10209049255441 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237484 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + # clamp to [-1.0, 1.0], strictly due to floating point precision + # issues + return round( + max( + -1.0, + min( + 1.0, + delta_ab + * n + / max( + max(1, a + b) * max(1, c + d), + max(1, a + c) * max(1, b + d), + ), + ), + ), + 15, + ) + + def sim(self, src, tar): + """Return the Kuhns V similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns V similarity + + Examples + -------- + >>> cmp = KuhnsV() + >>> cmp.sim('cat', 'hat') + 0.7487179487179485 + >>> cmp.sim('Niall', 'Neil') + 0.664738646101114 + >>> cmp.sim('aluminum', 'Catalan') + 0.551045246277205 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_vi.py b/abydos/distance/_kuhns_vi.py new file mode 100644 index 000000000..f0321f9ec --- /dev/null +++ b/abydos/distance/_kuhns_vi.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_vi. + +Kuhns VI correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsVI'] + + +class KuhnsVI(_TokenDistance): + r"""Kuhns VI correlation. + + For two sets X and Y and a population N, Kuhns VI correlation + :cite:`Kuhns:1965`, the excess of probability differences V over its + independence value (V), is + + .. math:: + + corr_{KuhnsVI}(X, Y) = + \frac{\delta(X, Y)} + {min\big(|X|\cdot(1-\frac{|X|}{|N|}), |Y|(1-\frac{|Y|}{|N|})\big)} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsVI} = + \frac{\delta(a+b, a+c)} + {min\big((a+b)(1-\frac{a+b}{n}), (a+c)(1-\frac{a+c}{n})\big)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsVI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsVI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns VI correlation of two strings. 
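Kuhns V and Kuhns VI differ only in taking the larger or the smaller of the two marginal terms |X|(1 - |X|/n) and |Y|(1 - |Y|/n) as the denominator for δ. A sketch on made-up counts (assumptions):

a, b, c, d = 2, 2, 1, 11
n = a + b + c + d                                      # 16
delta_ab = a - (a + b) * (a + c) / n                   # 1.25
term_x = (a + b) * (1 - (a + b) / n)                   # 4 * 0.75 = 3.0
term_y = (a + c) * (1 - (a + c) / n)                   # 3 * 13/16 = 2.4375
corr_v = delta_ab / max(term_x, term_y)                # 1.25 / 3.0 ≈ 0.4167
corr_vi = delta_ab / min(term_x, term_y)               # 1.25 / 2.4375 ≈ 0.5128
sim_v, sim_vi = (1 + corr_v) / 2, (1 + corr_vi) / 2    # ≈ 0.7083, ≈ 0.7564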
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VI correlation + + Examples + -------- + >>> cmp = KuhnsVI() + >>> cmp.corr('cat', 'hat') + 0.497435897435897 + >>> cmp.corr('Niall', 'Neil') + 0.394865211810013 + >>> cmp.corr('aluminum', 'Catalan') + 0.11470398970399 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237484 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + # clamp to [-1.0, 1.0], strictly due to floating point precision + # issues + return round( + max( + -1.0, + min( + 1.0, + delta_ab + * n + / min( + max(1, a + b) * max(1, c + d), + max(1, a + c) * max(1, b + d), + ), + ), + ), + 15, + ) + + def sim(self, src, tar): + """Return the Kuhns VI similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VI similarity + + Examples + -------- + >>> cmp = KuhnsVI() + >>> cmp.sim('cat', 'hat') + 0.7487179487179485 + >>> cmp.sim('Niall', 'Neil') + 0.6974326059050064 + >>> cmp.sim('aluminum', 'Catalan') + 0.557351994851995 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_vii.py b/abydos/distance/_kuhns_vii.py new file mode 100644 index 000000000..fa169e332 --- /dev/null +++ b/abydos/distance/_kuhns_vii.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_vii. + +Kuhns VII correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsVII'] + + +class KuhnsVII(_TokenDistance): + r"""Kuhns VII correlation. + + For two sets X and Y and a population N, Kuhns VII correlation + :cite:`Kuhns:1965`, the excess of angle between vector over its + independence value (G), is + + .. math:: + + corr_{KuhnsVII}(X, Y) = + \frac{\delta(X, Y)}{\sqrt{|X|\cdot|Y|}} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsVII} = + \frac{\delta(a+b, a+c)}{\sqrt{(a+b)(a+c)}} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsVII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsVII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns VII correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VII correlation + + Examples + -------- + >>> cmp = KuhnsVII() + >>> cmp.corr('cat', 'hat') + 0.49489795918367346 + >>> cmp.corr('Niall', 'Neil') + 0.3581621145590755 + >>> cmp.corr('aluminum', 'Catalan') + 0.10704185456178524 + >>> cmp.corr('ATCG', 'TAGC') + -0.006377551020408163 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return delta_ab / ((a + b) * (a + c)) ** 0.5 + + def sim(self, src, tar): + """Return the Kuhns VII similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VII similarity + + Examples + -------- + >>> cmp = KuhnsVII() + >>> cmp.sim('cat', 'hat') + 0.663265306122449 + >>> cmp.sim('Niall', 'Neil') + 0.572108076372717 + >>> cmp.sim('aluminum', 'Catalan') + 0.40469456970785683 + >>> cmp.sim('ATCG', 'TAGC') + 0.32908163265306123 + + + .. versionadded:: 0.4.0 + + """ + return (0.5 + self.corr(src, tar)) / 1.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_viii.py b/abydos/distance/_kuhns_viii.py new file mode 100644 index 000000000..93a240f0c --- /dev/null +++ b/abydos/distance/_kuhns_viii.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_viii. + +Kuhns VIII correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsVIII'] + + +class KuhnsVIII(_TokenDistance): + r"""Kuhns VIII correlation. + + For two sets X and Y and a population N, Kuhns VIII correlation + :cite:`Kuhns:1965`, the excess of coefficient by the arithmetic mean over + its independence value (E), is + + .. math:: + + corr_{KuhnsVIII}(X, Y) = + \frac{\delta(X, Y)}{|X \cap Y|+\frac{1}{2}\cdot|X \triangle Y|} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsVIII} = + \frac{\delta(a+b, a+c)}{a+\frac{1}{2}(b+c)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + Notes + ----- + The coefficient presented in :cite:`Eidenberger:2014,Morris:2012` as Kuhns' + "Coefficient of arithmetic means" is a significantly different + coefficient, not evidenced in :cite:`Kuhns:1965`. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsVIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsVIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns VIII correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VIII correlation + + Examples + -------- + >>> cmp = KuhnsVIII() + >>> cmp.corr('cat', 'hat') + 0.49489795918367346 + >>> cmp.corr('Niall', 'Neil') + 0.35667903525046385 + >>> cmp.corr('aluminum', 'Catalan') + 0.10685650056200824 + >>> cmp.corr('ATCG', 'TAGC') + -0.006377551020408163 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return delta_ab / (a + 0.5 * (b + c)) + + def sim(self, src, tar): + """Return the Kuhns VIII similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns VIII similarity + + Examples + -------- + >>> cmp = KuhnsVIII() + >>> cmp.sim('cat', 'hat') + 0.663265306122449 + >>> cmp.sim('Niall', 'Neil') + 0.5711193568336426 + >>> cmp.sim('aluminum', 'Catalan') + 0.40457100037467214 + >>> cmp.sim('ATCG', 'TAGC') + 0.32908163265306123 + + + .. versionadded:: 0.4.0 + + """ + return (0.5 + self.corr(src, tar)) / 1.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_x.py b/abydos/distance/_kuhns_x.py new file mode 100644 index 000000000..07b7955a6 --- /dev/null +++ b/abydos/distance/_kuhns_x.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_x. + +Kuhns X correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsX'] + + +class KuhnsX(_TokenDistance): + r"""Kuhns X correlation. + + For two sets X and Y and a population N, Kuhns X correlation + :cite:`Kuhns:1965`, the excess of Yule's Q over its independence value (Q), + is + + .. math:: + + corr_{KuhnsX}(X, Y) = + \frac{|N| \cdot \delta(X, Y)}{|X \cap Y| \cdot + |(N \setminus X) \setminus Y| + + |X \setminus Y| \cdot |Y \setminus X|} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsX} = + \frac{n \cdot \delta(a+b, a+c)}{ad+bc} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsX instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsX, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns X correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns X correlation + + Examples + -------- + >>> cmp = KuhnsX() + >>> cmp.corr('cat', 'hat') + 0.994871794871795 + >>> cmp.corr('Niall', 'Neil') + 0.984635083226633 + >>> cmp.corr('aluminum', 'Catalan') + 0.864242424242424 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = max(1, self._total_complement_card()) + n = self._population_unique_card() + if a == n: + n += 1 + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + # clamp to [-1.0, 1.0], strictly due to floating point precision + # issues + return round( + max(-1.0, min(1.0, (n * delta_ab) / (a * d + b * c))), 15 + ) + + def sim(self, src, tar): + """Return the Kuhns X similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns X similarity + + Examples + -------- + >>> cmp = KuhnsX() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9923175416133165 + >>> cmp.sim('aluminum', 'Catalan') + 0.932121212121212 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_xi.py b/abydos/distance/_kuhns_xi.py new file mode 100644 index 000000000..c77be55d5 --- /dev/null +++ b/abydos/distance/_kuhns_xi.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_xi. 
+ +Kuhns XI correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsXI'] + + +class KuhnsXI(_TokenDistance): + r"""Kuhns XI correlation. + + For two sets X and Y and a population N, Kuhns XI correlation + :cite:`Kuhns:1965`, the excess of Yule's Y over its independence value (Y), + is + + .. math:: + + corr_{KuhnsXI}(X, Y) = + \frac{|N| \cdot \delta(X, Y)}{(\sqrt{|X \cap Y| \cdot + |(N \setminus X) \setminus Y|} + + \sqrt{|X \setminus Y| \cdot |Y \setminus X|})^2} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{KuhnsXI} = + \frac{n \cdot \delta(a+b, a+c)}{(\sqrt{ad}+\sqrt{bc})^2} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsXI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KuhnsXI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Kuhns XI correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns XI correlation + + Examples + -------- + >>> cmp = KuhnsXI() + >>> cmp.corr('cat', 'hat') + 0.9034892632818761 + >>> cmp.corr('Niall', 'Neil') + 0.8382551144735259 + >>> cmp.corr('aluminum', 'Catalan') + 0.5749826820237787 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + # clamp to [-1.0, 1.0], strictly due to floating point precision + # issues + return max( + -1.0, + min( + 1.0, + (n * delta_ab) + / max(1.0, ((a * d) ** 0.5 + (b * c) ** 0.5) ** 2), + ), + ) + + def sim(self, src, tar): + """Return the Kuhns XI similarity of two strings. 
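# sim() below rescales corr() from [-1, 1] onto [0, 1] as (1 + corr) / 2.
# A one-line sketch using the 'cat'/'hat' correlation reproduced from the
# corr() example above:
sim_cat_hat = (1.0 + 0.9034892632818761) / 2.0  # roughly 0.9517, cf. sim() below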
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns XI similarity + + Examples + -------- + >>> cmp = KuhnsXI() + >>> cmp.sim('cat', 'hat') + 0.951744631640938 + >>> cmp.sim('Niall', 'Neil') + 0.919127557236763 + >>> cmp.sim('aluminum', 'Catalan') + 0.7874913410118893 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kuhns_xii.py b/abydos/distance/_kuhns_xii.py new file mode 100644 index 000000000..21b5f270b --- /dev/null +++ b/abydos/distance/_kuhns_xii.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kuhns_xii. + +Kuhns XII similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KuhnsXII'] + + +class KuhnsXII(_TokenDistance): + r"""Kuhns XII similarity. + + For two sets X and Y and a population N, Kuhns XII similarity + :cite:`Kuhns:1965`, the excess of index of independence over its + independence value (I), is + + .. math:: + + sim_{KuhnsXII}(X, Y) = + \frac{|N| \cdot \delta(X, Y)}{|X| \cdot |Y|} + + where + + .. math:: + + \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KuhnsXII} = + \frac{n \cdot \delta(a+b, a+c)}{(a+b)(a+c)} + + where + + .. math:: + + \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize KuhnsXII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(KuhnsXII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Kuhns XII similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kuhns XII similarity + + Examples + -------- + >>> cmp = KuhnsXII() + >>> cmp.sim_score('cat', 'hat') + 97.0 + >>> cmp.sim_score('Niall', 'Neil') + 51.266666666666666 + >>> cmp.sim_score('aluminum', 'Catalan') + 9.902777777777779 + >>> cmp.sim_score('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + apbmapc = (a + b) * (a + c) + if not apbmapc: + delta_ab = a + else: + delta_ab = a - apbmapc / n + if not delta_ab: + return 0.0 + else: + return max(-1.0, n * delta_ab / ((a + b) * (a + c))) + + def sim(self, src, tar): + """Return the normalized Kuhns XII similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Kuhns XII similarity + + Examples + -------- + >>> cmp = KuhnsXII() + >>> cmp.sim('cat', 'hat') + 0.2493573264781491 + >>> cmp.sim('Niall', 'Neil') + 0.1323010752688172 + >>> cmp.sim('aluminum', 'Catalan') + 0.012877474353417137 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + minval, maxval = sorted( + [self._intersection_card(), self._total_complement_card()] + ) + if score < 0.0: + return min(1.0, (1.0 + score) / 2.0) + norm = 1.0 + if minval and maxval: + norm = maxval / minval + return min(1.0, score / norm) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kulczynski_i.py b/abydos/distance/_kulczynski_i.py new file mode 100644 index 000000000..a22193b68 --- /dev/null +++ b/abydos/distance/_kulczynski_i.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kulczynski_i. + +Kulczynski I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KulczynskiI'] + + +class KulczynskiI(_TokenDistance): + r"""Kulczynski I similarity. + + For two sets X and Y, Kulczynski I similarity + :cite:`Kulczynski:1927` is + + .. math:: + + sim_{KulczynskiI}(X, Y) = + \frac{|X \cap Y|}{|X \setminus Y| + |Y \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. 
math:: + + sim_{KulczynskiI} = + \frac{a}{b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize KulczynskiI instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KulczynskiI, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim_score(self, src, tar): + """Return the Kulczynski I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kulczynski I similarity + + Examples + -------- + >>> cmp = KulczynskiI() + >>> cmp.sim_score('cat', 'hat') + 0.5 + >>> cmp.sim_score('Niall', 'Neil') + 0.2857142857142857 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.06666666666666667 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + if not a: + return 0.0 + if not b + c: + return float('inf') + return a / (b + c) + + def sim(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Kulczynski I similarity. + + + .. versionadded:: 0.3.6 + + """ + raise NotImplementedError( + 'Method disabled for Kulczynski I similarity.' + ) + + def dist(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Kulczynski I similarity. + + + .. versionadded:: 0.3.6 + + """ + raise NotImplementedError( + 'Method disabled for Kulczynski I similarity.' + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_kulczynski_ii.py b/abydos/distance/_kulczynski_ii.py new file mode 100644 index 000000000..075eb48a0 --- /dev/null +++ b/abydos/distance/_kulczynski_ii.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._kulczynski_ii. + +Kulczynski II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['KulczynskiII'] + + +class KulczynskiII(_TokenDistance): + r"""Kulczynski II similarity. + + For two sets X and Y, Kulczynski II similarity :cite:`Kulczynski:1927` or + Driver & Kroeber similarity :cite:`Driver:1932` is + + .. math:: + + sim_{KulczynskiII}(X, Y) = + \frac{1}{2} + \Bigg(\frac{|X \cap Y|}{|X|} + \frac{|X \cap Y|}{|Y|}\Bigg) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{KulczynskiII} = + \frac{1}{2}\Bigg(\frac{a}{a+b}+\frac{a}{a+c}\Bigg) + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize KulczynskiII instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(KulczynskiII, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Kulczynski II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Kulczynski II similarity + + Examples + -------- + >>> cmp = KulczynskiII() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3666666666666667 + >>> cmp.sim('aluminum', 'Catalan') + 0.11805555555555555 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + + if not apb or not apc: + return 0.0 + + return 0.5 * (a / apb + a / apc) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_lcprefix.py b/abydos/distance/_lcprefix.py new file mode 100644 index 000000000..9a41d3ef4 --- /dev/null +++ b/abydos/distance/_lcprefix.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._lcprefix. + +Longest common prefix +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from os.path import commonprefix + +from six import text_type + +from ._distance import _Distance + +__all__ = ['LCPrefix'] + + +class LCPrefix(_Distance): + """Longest common prefix. + + .. versionadded:: 0.4.0 + """ + + def lcprefix(self, strings): + """Return the longest common prefix of a list of strings. + + Longest common prefix (LCPrefix). + + Parameters + ---------- + strings : list of strings + Strings for comparison + + Returns + ------- + str + The longest common prefix + + Examples + -------- + >>> pfx = LCPrefix() + >>> pfx.lcprefix(['cat', 'hat']) + '' + >>> pfx.lcprefix(['Niall', 'Neil']) + 'N' + >>> pfx.lcprefix(['aluminum', 'Catalan']) + '' + >>> pfx.lcprefix(['ATCG', 'TAGC']) + '' + + + .. versionadded:: 0.4.0 + + """ + return commonprefix(strings) + + def dist_abs(self, src, tar, *args): + """Return the length of the longest common prefix of the strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + *args : strs + Additional strings for comparison + + Raises + ------ + ValueError + All arguments must be of type str + + Returns + ------- + int + The length of the longest common prefix + + Examples + -------- + >>> pfx = LCPrefix() + >>> pfx.dist_abs('cat', 'hat') + 0 + >>> pfx.dist_abs('Niall', 'Neil') + 1 + >>> pfx.dist_abs('aluminum', 'Catalan') + 0 + >>> pfx.dist_abs('ATCG', 'TAGC') + 0 + + + .. versionadded:: 0.4.0 + + """ + strings = [text_type(src), text_type(tar)] + for arg in args: + if isinstance(arg, (str, text_type)): + strings.append(text_type(arg)) + else: + raise TypeError('All arguments must be of type str') + + return len(self.lcprefix(strings)) + + def sim(self, src, tar, *args): + r"""Return the longest common prefix similarity of two or more strings. + + Longest common prefix similarity (:math:`sim_{LCPrefix}`). + + This employs the LCPrefix function to derive a similarity metric: + :math:`sim_{LCPrefix}(s,t) = \frac{|LCPrefix(s,t)|}{max(|s|, |t|)}` + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + *args : strs + Additional strings for comparison + + Returns + ------- + float + LCPrefix similarity + + Examples + -------- + >>> pfx = LCPrefix() + >>> pfx.sim('cat', 'hat') + 0.0 + >>> pfx.sim('Niall', 'Neil') + 0.2 + >>> pfx.sim('aluminum', 'Catalan') + 0.0 + >>> pfx.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + elif not src or not tar: + return 0.0 + dist = self.dist_abs(src, tar, *args) + maxlen = max(len(src), len(tar), *[len(arg) for arg in args]) + return dist / maxlen + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_lcsseq.py b/abydos/distance/_lcsseq.py index 005eba0cf..d829f49e3 100644 --- a/abydos/distance/_lcsseq.py +++ b/abydos/distance/_lcsseq.py @@ -28,10 +28,13 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import int as np_int from numpy import zeros as np_zeros from ._distance import _Distance +from .. import __version__ __all__ = ['LCSseq', 'dist_lcsseq', 'lcsseq', 'sim_lcsseq'] @@ -41,8 +44,31 @@ class LCSseq(_Distance): Longest common subsequence (LCSseq) is the longest subsequence of characters that two strings have in common. + + .. versionadded:: 0.3.6 """ + def __init__(self, normalizer=max, **kwargs): + r"""Initialize LCSseq. + + Parameters + ---------- + normalizer : function + A normalization function for the normalized similarity & distance. + By default, the max of the lengths of the input strings. If + lambda x: sum(x)/2.0 is supplied, the normalization proposed in + :cite:`Radev:2001` is used, i.e. + :math:`\frac{2 \dot |LCS(src, tar)|}{|src| + |tar|}`. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(LCSseq, self).__init__(**kwargs) + self._normalizer = normalizer + def lcsseq(self, src, tar): """Return the longest common subsequence of two strings. @@ -77,6 +103,11 @@ def lcsseq(self, src, tar): >>> sseq.lcsseq('ATCG', 'TAGC') 'AC' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) @@ -136,14 +167,28 @@ def sim(self, src, tar): >>> sseq.sim('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + .. versionchanged:: 0.4.0 + Added normalization option + """ if src == tar: return 1.0 elif not src or not tar: return 0.0 - return len(self.lcsseq(src, tar)) / max(len(src), len(tar)) + return len(self.lcsseq(src, tar)) / self._normalizer( + [len(src), len(tar)] + ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSseq.lcsseq method instead.', +) def lcsseq(src, tar): """Return the longest common subsequence of two strings. @@ -172,10 +217,18 @@ def lcsseq(src, tar): >>> lcsseq('ATCG', 'TAGC') 'AC' + .. versionadded:: 0.1.0 + """ return LCSseq().lcsseq(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSseq.sim method instead.', +) def sim_lcsseq(src, tar): r"""Return the longest common subsequence similarity of two strings. @@ -204,10 +257,18 @@ def sim_lcsseq(src, tar): >>> sim_lcsseq('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + """ return LCSseq().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSseq.dist method instead.', +) def dist_lcsseq(src, tar): """Return the longest common subsequence distance between two strings. 
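The normalizer option added to LCSseq above can be exercised as follows (a
minimal usage sketch, assuming the abydos.distance package layout in this
diff; the figures follow from a longest common subsequence of length 3 for
this pair):

    >>> from abydos.distance import LCSseq
    >>> LCSseq().sim('Niall', 'Neil')  # default max-length normalization
    0.6
    >>> LCSseq(normalizer=lambda x: sum(x) / 2.0).sim('Niall', 'Neil')
    0.6666666666666666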
diff --git a/abydos/distance/_lcsstr.py b/abydos/distance/_lcsstr.py index d20db63ff..8e1821ea9 100644 --- a/abydos/distance/_lcsstr.py +++ b/abydos/distance/_lcsstr.py @@ -28,18 +28,45 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import int as np_int from numpy import zeros as np_zeros from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['LCSstr', 'dist_lcsstr', 'lcsstr', 'sim_lcsstr'] class LCSstr(_Distance): - """Longest common substring.""" + """Longest common substring. + + .. versionadded:: 0.3.6 + """ + + def __init__(self, normalizer=max, **kwargs): + r"""Initialize LCSseq. + + Parameters + ---------- + normalizer : function + A normalization function for the normalized similarity & distance. + By default, the max of the lengths of the input strings. If + lambda x: sum(x)/2.0 is supplied, the normalization proposed in + :cite:`Radev:2001` is used, i.e. + :math:`\frac{2 \dot |LCS(src, tar)|}{|src| + |tar|}`. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(LCSstr, self).__init__(**kwargs) + self._normalizer = normalizer def lcsstr(self, src, tar): """Return the longest common substring of two strings. @@ -80,6 +107,11 @@ def lcsstr(self, src, tar): >>> sstr.lcsstr('ATCG', 'TAGC') 'A' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) longest, i_longest = 0, 0 @@ -116,23 +148,39 @@ def sim(self, src, tar): Examples -------- - >>> sim_lcsstr('cat', 'hat') + >>> sstr = LCSstr() + >>> sstr.sim('cat', 'hat') 0.6666666666666666 - >>> sim_lcsstr('Niall', 'Neil') + >>> sstr.sim('Niall', 'Neil') 0.2 - >>> sim_lcsstr('aluminum', 'Catalan') + >>> sstr.sim('aluminum', 'Catalan') 0.25 - >>> sim_lcsstr('ATCG', 'TAGC') + >>> sstr.sim('ATCG', 'TAGC') 0.25 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + .. versionchanged:: 0.4.0 + Added normalization option + """ if src == tar: return 1.0 elif not src or not tar: return 0.0 - return len(self.lcsstr(src, tar)) / max(len(src), len(tar)) + return len(self.lcsstr(src, tar)) / self._normalizer( + [len(src), len(tar)] + ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSstr.lcsstr method instead.', +) def lcsstr(src, tar): """Return the longest common substring of two strings. @@ -161,10 +209,18 @@ def lcsstr(src, tar): >>> lcsstr('ATCG', 'TAGC') 'A' + .. versionadded:: 0.1.0 + """ return LCSstr().lcsstr(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSstr.sim method instead.', +) def sim_lcsstr(src, tar): """Return the longest common substring similarity of two strings. @@ -193,10 +249,18 @@ def sim_lcsstr(src, tar): >>> sim_lcsstr('ATCG', 'TAGC') 0.25 + .. versionadded:: 0.1.0 + """ return LCSstr().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LCSstr.dist method instead.', +) def dist_lcsstr(src, tar): """Return the longest common substring distance between two strings. @@ -225,6 +289,8 @@ def dist_lcsstr(src, tar): >>> dist_lcsstr('ATCG', 'TAGC') 0.75 + .. 
versionadded:: 0.1.0 + + """ return LCSstr().dist(src, tar) diff --git a/abydos/distance/_lcsuffix.py b/abydos/distance/_lcsuffix.py new file mode 100644 index 000000000..d1ebf7475 --- /dev/null +++ b/abydos/distance/_lcsuffix.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._lcsuffix. + +Longest common suffix +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from os.path import commonprefix + +from six import text_type + +from ._lcprefix import LCPrefix + +__all__ = ['LCSuffix'] + + +class LCSuffix(LCPrefix): + """Longest common suffix. + + .. versionadded:: 0.4.0 + """ + + def lcsuffix(self, strings): + """Return the longest common suffix of a list of strings. + + Longest common suffix (LCSuffix). + + Parameters + ---------- + strings : list of strings + Strings for comparison + + Returns + ------- + str + The longest common suffix + + Examples + -------- + >>> sfx = LCSuffix() + >>> sfx.lcsuffix(['cat', 'hat']) + 'at' + >>> sfx.lcsuffix(['Niall', 'Neil']) + 'l' + >>> sfx.lcsuffix(['aluminum', 'Catalan']) + '' + >>> sfx.lcsuffix(['ATCG', 'TAGC']) + '' + + + .. versionadded:: 0.4.0 + + """ + strings = [s[::-1] for s in strings] + return commonprefix(strings)[::-1] + + def dist_abs(self, src, tar, *args): + """Return the length of the longest common suffix of the strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + *args : strs + Additional strings for comparison + + Raises + ------ + TypeError + All arguments must be of type str + + Returns + ------- + int + The length of the longest common suffix + + Examples + -------- + >>> sfx = LCSuffix() + >>> sfx.dist_abs('cat', 'hat') + 2 + >>> sfx.dist_abs('Niall', 'Neil') + 1 + >>> sfx.dist_abs('aluminum', 'Catalan') + 0 + >>> sfx.dist_abs('ATCG', 'TAGC') + 0 + + + .. versionadded:: 0.4.0 + + """ + strings = [text_type(src), text_type(tar)] + for arg in args: + if isinstance(arg, (str, text_type)): + strings.append(text_type(arg)) + else: + raise TypeError('All arguments must be of type str') + + return len(self.lcsuffix(strings)) + + def sim(self, src, tar, *args): + r"""Return the longest common suffix similarity of two or more strings. + + Longest common suffix similarity (:math:`sim_{LCSuffix}`).
+ + This employs the LCSuffix function to derive a similarity metric: + :math:`sim_{LCSuffix}(s,t) = \frac{|LCSuffix(s,t)|}{max(|s|, |t|)}` + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + *args : strs + Additional strings for comparison + + Returns + ------- + float + LCSuffix similarity + + Examples + -------- + >>> sfx = LCSuffix() + >>> sfx.sim('cat', 'hat') + 0.6666666666666666 + >>> sfx.sim('Niall', 'Neil') + 0.2 + >>> sfx.sim('aluminum', 'Catalan') + 0.0 + >>> sfx.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + elif not src or not tar: + return 0.0 + dist = self.dist_abs(src, tar, *args) + maxlen = max(len(src), len(tar), *[len(arg) for arg in args]) + return dist / maxlen + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_length.py b/abydos/distance/_length.py index 3e2ca00ac..a0a85fee3 100644 --- a/abydos/distance/_length.py +++ b/abydos/distance/_length.py @@ -28,13 +28,19 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ __all__ = ['Length', 'dist_length', 'sim_length'] class Length(_Distance): - """Length similarity and distance.""" + """Length similarity and distance. + + .. versionadded:: 0.3.6 + """ def sim(self, src, tar): """Return the length similarity of two strings. @@ -66,6 +72,11 @@ def sim(self, src, tar): >>> cmp.sim('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 @@ -76,6 +87,12 @@ def sim(self, src, tar): ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Length.sim method instead.', +) def sim_length(src, tar): """Return the length similarity of two strings. @@ -104,10 +121,19 @@ def sim_length(src, tar): >>> sim_length('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.1.0 + """ return Length().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Length.dist method instead.', +) def dist_length(src, tar): """Return the length distance between two strings. @@ -136,6 +162,9 @@ def dist_length(src, tar): >>> dist_length('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + """ return Length().dist(src, tar) diff --git a/abydos/distance/_levenshtein.py b/abydos/distance/_levenshtein.py index 7e8160341..77142238c 100644 --- a/abydos/distance/_levenshtein.py +++ b/abydos/distance/_levenshtein.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -32,13 +32,17 @@ unicode_literals, ) +from sys import float_info -from numpy import int as np_int +from deprecation import deprecated + +from numpy import float as np_float from numpy import zeros as np_zeros from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein'] @@ -58,17 +62,24 @@ class Levenshtein(_Distance): Levenshtein edit distance ordinarily has unit insertion, deletion, and substitution costs. + + .. versionadded:: 0.3.6 + .. versionchanged:: 0.4.0 + Added taper option """ - def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): - """Return the Levenshtein distance between two strings.
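In the hunk that follows, the mode and cost options (together with the new
normalizer and taper options) move from dist_abs arguments to constructor
arguments. A minimal before/after sketch, reusing the 'osa' example values
that appear in the docstrings of this diff (import path assumed from the
__all__ list above):

    >>> from abydos.distance import Levenshtein, levenshtein
    >>> levenshtein('ATCG', 'TAGC', mode='osa')           # deprecated functional API
    2
    >>> Levenshtein(mode='osa').dist_abs('ATCG', 'TAGC')  # class-based API
    2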
+ def __init__( + self, + mode='lev', + cost=(1, 1, 1, 1), + normalizer=max, + taper=False, + **kwargs + ): + """Initialize Levenshtein instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison mode : str Specifies a mode for computing the Levenshtein distance: @@ -83,6 +94,44 @@ def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): A 4-tuple representing the cost of the four possible edits: inserts, deletes, substitutions, and transpositions, respectively (by default: (1, 1, 1, 1)) + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + taper : bool + Enables cost tapering. Following :cite:`Zobel:1996`, it causes + edits at the start of the string to "just [exceed] twice the + minimum penalty for replacement or deletion at the end of the + string". + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Levenshtein, self).__init__(**kwargs) + self._mode = mode + self._cost = cost + self._normalizer = normalizer + self._taper_enabled = taper + + def _taper(self, pos, length): + return ( + round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15) + if self._taper_enabled + else 1 + ) + + def dist_abs(self, src, tar): + """Return the Levenshtein distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -101,37 +150,57 @@ def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): >>> cmp.dist_abs('ATCG', 'TAGC') 3 - >>> cmp.dist_abs('ATCG', 'TAGC', mode='osa') + >>> cmp = Levenshtein(mode='osa') + >>> cmp.dist_abs('ATCG', 'TAGC') 2 - >>> cmp.dist_abs('ACTG', 'TAGC', mode='osa') + >>> cmp.dist_abs('ACTG', 'TAGC') 4 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ - ins_cost, del_cost, sub_cost, trans_cost = cost + ins_cost, del_cost, sub_cost, trans_cost = self._cost + + src_len = len(src) + tar_len = len(tar) + max_len = max(src_len, tar_len) if src == tar: return 0 if not src: - return len(tar) * ins_cost + return sum( + ins_cost * self._taper(pos, max_len) for pos in range(tar_len) + ) if not tar: - return len(src) * del_cost - - d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) - for i in range(len(src) + 1): - d_mat[i, 0] = i * del_cost - for j in range(len(tar) + 1): - d_mat[0, j] = j * ins_cost - - for i in range(len(src)): - for j in range(len(tar)): + return sum( + del_cost * self._taper(pos, max_len) for pos in range(src_len) + ) + + d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) + for i in range(src_len + 1): + d_mat[i, 0] = i * self._taper(i, max_len) * del_cost + for j in range(tar_len + 1): + d_mat[0, j] = j * self._taper(j, max_len) * ins_cost + + for i in range(src_len): + for j in range(tar_len): d_mat[i + 1, j + 1] = min( - d_mat[i + 1, j] + ins_cost, # ins - d_mat[i, j + 1] + del_cost, # del + d_mat[i + 1, j] + + ins_cost * self._taper(1 + max(i, j), max_len), # ins + d_mat[i, j + 1] + + del_cost * self._taper(1 + max(i, j), max_len), # del d_mat[i, j] - + (sub_cost if src[i] != tar[j] else 0), # sub/== + + ( + sub_cost * self._taper(1 + max(i, j), max_len) + if src[i] != tar[j] + else 0 + ), # sub/== ) - if mode == 'osa': + if self._mode == 'osa': if ( i + 1 > 1 and j + 1 > 1 @@ -141,12 +210,16 @@ def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): # transposition d_mat[i + 1, j + 1] = min( d_mat[i + 1, j + 1], - d_mat[i - 1, j - 1] + trans_cost, + d_mat[i - 1, j - 1] + + trans_cost * self._taper(1 + max(i, j), max_len), ) - return d_mat[len(src), len(tar)] + if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]: + return int(d_mat[src_len, tar_len]) + else: + return d_mat[src_len, tar_len] - def dist(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): + def dist(self, src, tar): """Return the normalized Levenshtein distance between two strings. The Levenshtein distance is normalized by dividing the Levenshtein @@ -162,20 +235,6 @@ def dist(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): Source string for comparison tar : str Target string for comparison - mode : str - Specifies a mode for computing the Levenshtein distance: - - - ``lev`` (default) computes the ordinary Levenshtein distance, - in which edits may include inserts, deletes, and - substitutions - - ``osa`` computes the Optimal String Alignment distance, in - which edits may include inserts, deletes, substitutions, and - transpositions but substrings may only be edited once - - cost : tuple - A 4-tuple representing the cost of the four possible edits: - inserts, deletes, substitutions, and transpositions, respectively - (by default: (1, 1, 1, 1)) Returns ------- @@ -194,15 +253,46 @@ def dist(self, src, tar, mode='lev', cost=(1, 1, 1, 1)): >>> cmp.dist('ATCG', 'TAGC') 0.75 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0 - ins_cost, del_cost = cost[:2] - return levenshtein(src, tar, mode, cost) / ( - max(len(src) * del_cost, len(tar) * ins_cost) - ) - - + ins_cost, del_cost = self._cost[:2] + + src_len = len(src) + tar_len = len(tar) + + if self._taper_enabled: + normalize_term = self._normalizer( + [ + sum( + self._taper(pos, src_len) * del_cost + for pos in range(src_len) + ), + sum( + self._taper(pos, tar_len) * ins_cost + for pos in range(tar_len) + ), + ] + ) + else: + normalize_term = self._normalizer( + [src_len * del_cost, tar_len * ins_cost] + ) + + return self.dist_abs(src, tar) / normalize_term + + +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Levenshtein.dist_abs method instead.', +) def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): """Return the Levenshtein distance between two strings. @@ -249,10 +339,18 @@ def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): >>> levenshtein('ACTG', 'TAGC', mode='osa') 4 + .. versionadded:: 0.1.0 + """ - return Levenshtein().dist_abs(src, tar, mode, cost) + return Levenshtein(mode=mode, cost=cost).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Levenshtein.dist method instead.', +) def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): """Return the normalized Levenshtein distance between two strings. @@ -294,10 +392,18 @@ def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): >>> dist_levenshtein('ATCG', 'TAGC') 0.75 + .. versionadded:: 0.1.0 + """ - return Levenshtein().dist(src, tar, mode, cost) + return Levenshtein(mode=mode, cost=cost).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Levenshtein.sim method instead.', +) def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): """Return the Levenshtein similarity of two strings. @@ -339,8 +445,10 @@ def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)): >>> sim_levenshtein('ATCG', 'TAGC') 0.25 + .. versionadded:: 0.1.0 + """ - return Levenshtein().sim(src, tar, mode, cost) + return Levenshtein(mode=mode, cost=cost).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_lorentzian.py b/abydos/distance/_lorentzian.py new file mode 100644 index 000000000..bb8daab34 --- /dev/null +++ b/abydos/distance/_lorentzian.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._lorentzian. + +Lorentzian distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance + +__all__ = ['Lorentzian'] + + +class Lorentzian(_TokenDistance): + r"""Lorentzian distance. 
+ + For two multisets X and Y drawn from an alphabet S, Lorentzian distance is + + .. math:: + + dist_{Lorentzian}(X, Y) = + \sum_{i \in S} log(1 + |A_i - B_i|) + + Notes + ----- + No primary source for this measure could be located, but it is included + in surveys and catalogues, such as :cite:`Deza:2016` and :cite:`Cha:2008`. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Lorentzian instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Lorentzian, self).__init__(tokenizer=tokenizer, **kwargs) + + def dist_abs(self, src, tar): + """Return the Lorentzian distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Lorentzian distance + + Examples + -------- + >>> cmp = Lorentzian() + >>> cmp.dist_abs('cat', 'hat') + 2.772588722239781 + >>> cmp.dist_abs('Niall', 'Neil') + 4.852030263919617 + >>> cmp.dist_abs('aluminum', 'Catalan') + 10.1095256359474 + >>> cmp.dist_abs('ATCG', 'TAGC') + 6.931471805599453 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + + return sum( + log1p(abs(self._src_tokens[tok] - self._tar_tokens[tok])) + for tok in alphabet + ) + + def dist(self, src, tar): + """Return the normalized Lorentzian distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Lorentzian distance + + Examples + -------- + >>> cmp = Lorentzian() + >>> cmp.dist('cat', 'hat') + 0.6666666666666667 + >>> cmp.dist('Niall', 'Neil') + 0.7777777777777778 + >>> cmp.dist('aluminum', 'Catalan') + 0.9358355851062377 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + elif not src or not tar: + return 1.0 + + score = self.dist_abs(src, tar) + + alphabet = self._total().keys() + + return score / sum( + log1p(max(self._src_tokens[tok], self._tar_tokens[tok])) + for tok in alphabet + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_maarel.py b/abydos/distance/_maarel.py new file mode 100644 index 000000000..9af72030f --- /dev/null +++ b/abydos/distance/_maarel.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._maarel. 
+ +Maarel correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Maarel'] + + +class Maarel(_TokenDistance): + r"""Maarel correlation. + + For two sets X and Y and a population N, Maarel correlation + :cite:`Maarel:1969` is + + .. math:: + + corr_{Maarel}(X, Y) = + \frac{2|X \cap Y| - |X \setminus Y| - |Y \setminus X|}{|X| + |Y|} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Maarel} = + \frac{2a - b - c}{2a + b + c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Maarel instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Maarel, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Maarel correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Maarel correlation + + Examples + -------- + >>> cmp = Maarel() + >>> cmp.corr('cat', 'hat') + 0.0 + >>> cmp.corr('Niall', 'Neil') + -0.2727272727272727 + >>> cmp.corr('aluminum', 'Catalan') + -0.7647058823529411 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return -1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (2 * a - b - c) / (2 * a + b + c) + + def sim(self, src, tar): + """Return the Maarel similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Maarel similarity + + Examples + -------- + >>> cmp = Maarel() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.11764705882352944 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_manhattan.py b/abydos/distance/_manhattan.py index dff53cb0d..a0e60b3e6 100644 --- a/abydos/distance/_manhattan.py +++ b/abydos/distance/_manhattan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._minkowski import Minkowski +from .. import __version__ __all__ = ['Manhattan', 'dist_manhattan', 'manhattan', 'sim_manhattan'] @@ -38,9 +41,54 @@ class Manhattan(Minkowski): Manhattan distance is the city-block or taxi-cab distance, equivalent to Minkowski distance in :math:`L^1`-space. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): + def __init__( + self, alphabet=0, tokenizer=None, intersection_type='crisp', **kwargs + ): + """Initialize Manhattan instance. + + Parameters + ---------- + alphabet : collection or int + The values or size of the alphabet + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Manhattan, self).__init__( + pval=1, + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar, normalized=False): """Return the Manhattan distance between two strings. Parameters @@ -49,12 +97,8 @@ def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version normalized : bool Normalizes to [0, 1] if True - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -73,12 +117,15 @@ def dist_abs(self, src, tar, qval=2, normalized=False, alphabet=None): >>> cmp.dist_abs('ATCG', 'TAGC') 10.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return super(self.__class__, self).dist_abs( - src, tar, qval, 1, normalized, alphabet - ) + return super(Manhattan, self).dist_abs(src, tar, normalized=normalized) - def dist(self, src, tar, qval=2, alphabet=None): + def dist(self, src, tar): """Return the normalized Manhattan distance between two strings. 
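The Maarel correlation defined a little earlier depends only on the intra-pair counts a, b, and c, so it can be reproduced directly from bigram multisets. A minimal sketch under the same '$'/'#'-padded bigram assumption (degenerate cases such as empty strings are ignored); the printed values match the Maarel doctests:

    from collections import Counter


    def bigrams(word):
        padded = '$' + word + '#'
        return Counter(padded[i:i + 2] for i in range(len(padded) - 1))


    def table_counts(src, tar):
        """Return (a, b, c): shared, src-only, and tar-only bigram counts."""
        s, t = bigrams(src), bigrams(tar)
        a = sum((s & t).values())  # |X intersect Y|
        b = sum((s - t).values())  # |X \ Y|
        c = sum((t - s).values())  # |Y \ X|
        return a, b, c


    def maarel_corr(src, tar):
        a, b, c = table_counts(src, tar)
        return (2 * a - b - c) / (2 * a + b + c)


    def maarel_sim(src, tar):
        return (1.0 + maarel_corr(src, tar)) / 2.0


    print(maarel_corr('cat', 'hat'))     # 0.0
    print(maarel_corr('Niall', 'Neil'))  # -0.2727272727272727 (-3/11)
    print(maarel_sim('Niall', 'Neil'))   # 0.36363636363636365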
The normalized Manhattan distance is a distance metric in @@ -92,10 +139,6 @@ def dist(self, src, tar, qval=2, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -114,10 +157,21 @@ def dist(self, src, tar, qval=2, alphabet=None): >>> cmp.dist('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return self.dist_abs(src, tar, qval, True, alphabet) + return self.dist_abs(src, tar, normalized=True) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Manhattan.dist_abs method instead.', +) def manhattan(src, tar, qval=2, normalized=False, alphabet=None): """Return the Manhattan distance between two strings. @@ -130,7 +184,7 @@ def manhattan(src, tar, qval=2, normalized=False, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram normalized : bool Normalizes to [0, 1] if True alphabet : collection or int @@ -152,11 +206,21 @@ def manhattan(src, tar, qval=2, normalized=False, alphabet=None): >>> manhattan('ATCG', 'TAGC') 10.0 + .. versionadded:: 0.3.0 + """ - return Manhattan().dist_abs(src, tar, qval, normalized, alphabet) + return Manhattan(alphabet=alphabet, qval=qval).dist_abs( + src, tar, normalized=normalized + ) -def dist_manhattan(src, tar, qval=2, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Manhattan.dist method instead.', +) +def dist_manhattan(src, tar, qval=2, alphabet=0): """Return the normalized Manhattan distance between two strings. This is a wrapper for :py:meth:`Manhattan.dist`. @@ -168,7 +232,7 @@ def dist_manhattan(src, tar, qval=2, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alphabet : collection or int The values or size of the alphabet @@ -188,11 +252,19 @@ def dist_manhattan(src, tar, qval=2, alphabet=None): >>> dist_manhattan('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.3.0 + """ - return Manhattan().dist(src, tar, qval, alphabet) + return Manhattan(alphabet=alphabet, qval=qval).dist(src, tar) -def sim_manhattan(src, tar, qval=2, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Manhattan.sim method instead.', +) +def sim_manhattan(src, tar, qval=2, alphabet=0): """Return the normalized Manhattan similarity of two strings. This is a wrapper for :py:meth:`Manhattan.sim`. @@ -204,7 +276,7 @@ def sim_manhattan(src, tar, qval=2, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alphabet : collection or int The values or size of the alphabet @@ -224,8 +296,10 @@ def sim_manhattan(src, tar, qval=2, alphabet=None): >>> sim_manhattan('ATCG', 'TAGC') 0.0 + .. 
versionadded:: 0.3.0 + """ - return Manhattan().sim(src, tar, qval, alphabet) + return Manhattan(alphabet=alphabet, qval=qval).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_marking.py b/abydos/distance/_marking.py new file mode 100644 index 000000000..2473b1a40 --- /dev/null +++ b/abydos/distance/_marking.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._marking. + +Ehrenfeucht & Haussler's marking distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +__all__ = ['Marking'] + + +class Marking(_Distance): + r"""Ehrenfeucht & Haussler's marking distance. + + This edit distance :cite:`Ehrenfeucht:1988` is the number of `marked` + characters in one word that must be masked in order for that word to + consist entirely of substrings of another word. + + It is normalized by the length of the first word. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, **kwargs): + """Initialize Marking instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Marking, self).__init__(**kwargs) + + def dist_abs(self, src, tar): + """Return the marking distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + int + marking distance + + Examples + -------- + >>> cmp = Marking() + >>> cmp.dist_abs('cat', 'hat') + 1 + >>> cmp.dist_abs('Niall', 'Neil') + 3 + >>> cmp.dist_abs('aluminum', 'Catalan') + 5 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2 + >>> cmp.dist_abs('cbaabdcb', 'abcba') + 2 + + + .. versionadded:: 0.4.0 + + """ + distance = 0 + unmatched = src[:] + for i in range(len(unmatched) - 1, -1, -1): + if unmatched[i:] not in tar: + distance += 1 + unmatched = unmatched[:i] + + return distance + + def dist(self, src, tar): + """Return the normalized marking distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + marking distance + + Examples + -------- + >>> cmp = Marking() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.6 + >>> cmp.dist('aluminum', 'Catalan') + 0.625 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + >>> cmp.dist('cbaabdcb', 'abcba') + 0.25 + + + .. 
versionadded:: 0.4.0 + + """ + score = self.dist_abs(src, tar) + if score: + return score / len(src) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_marking_metric.py b/abydos/distance/_marking_metric.py new file mode 100644 index 000000000..f21b76ad6 --- /dev/null +++ b/abydos/distance/_marking_metric.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._marking_metric. + +Ehrenfeucht & Haussler's marking metric +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._marking import Marking + +__all__ = ['MarkingMetric'] + + +class MarkingMetric(Marking): + r"""Ehrenfeucht & Haussler's marking metric. + + This metric :cite:`Ehrenfeucht:1988` is the base 2 logarithm of the product + of the marking distances between each term plus 1 computed in both orders. + For strings x and y, this is: + + .. math:: + + dist_{MarkingMetric}(x, y) = + log_2((diff(x, y)+1)(diff(y, x)+1)) + + The function diff is Ehrenfeucht & Haussler's marking distance + :class:`Marking`. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, **kwargs): + """Initialize MarkingMetric instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(MarkingMetric, self).__init__(**kwargs) + + def dist_abs(self, src, tar): + """Return the marking distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + int + marking distance + + Examples + -------- + >>> cmp = MarkingMetric() + >>> cmp.dist_abs('cat', 'hat') + 2.0 + >>> cmp.dist_abs('Niall', 'Neil') + 3.5849625007211565 + >>> cmp.dist_abs('aluminum', 'Catalan') + 4.584962500721157 + >>> cmp.dist_abs('ATCG', 'TAGC') + 3.1699250014423126 + >>> cmp.dist_abs('cbaabdcb', 'abcba') + 2.584962500721156 + + + .. versionadded:: 0.4.0 + + """ + diff1 = super(MarkingMetric, self).dist_abs(src, tar) + diff2 = super(MarkingMetric, self).dist_abs(tar, src) + return log((diff1 + 1) * (diff2 + 1), 2) + + def dist(self, src, tar): + """Return the normalized marking distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + marking distance + + Examples + -------- + >>> cmp = Marking() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.6 + >>> cmp.dist('aluminum', 'Catalan') + 0.625 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + >>> cmp.dist('cbaabdcb', 'abcba') + 0.25 + + + .. 
versionadded:: 0.4.0 + + """ + score = self.dist_abs(src, tar) + if score: + return score / log((len(src) + 1) * (len(tar) + 1), 2) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_masi.py b/abydos/distance/_masi.py new file mode 100644 index 000000000..d20ed9bff --- /dev/null +++ b/abydos/distance/_masi.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._masi. + +MASI similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['MASI'] + + +class MASI(_TokenDistance): + r"""MASI similarity. + + Measuring Agreement on Set-valued Items (MASI) similarity + :cite:`Passonneau:2006` for two sets X and Y is based on Jaccard + similarity: + + .. math:: + + sim_{Jaccard}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} + + This Jaccard similarity is scaled by a value M, which is: + - 1 if :math:`X = Y` + - :math:`\frac{2}{3}` if :math:`X \subset Y` or :math:`Y \subset X` + - :math:`\frac{1}{3}` if :math:`X \cap Y \neq \emptyset`, + :math:`X \setminus Y \neq \emptyset`, and + :math:`Y \setminus X \neq \emptyset` + - 0 if :math:`X \cap Y = \emptyset` + + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize MASI instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(MASI, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the MASI similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + MASI similarity + + Examples + -------- + >>> cmp = MASI() + >>> cmp.sim('cat', 'hat') + 0.1111111111111111 + >>> cmp.sim('Niall', 'Neil') + 0.07407407407407407 + >>> cmp.sim('aluminum', 'Catalan') + 0.020833333333333332 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + abc = self._union_card() + + jaccard = a / abc + if b == 0 or c == 0: + monotonicity = 2 / 3 + elif a != 0: + monotonicity = 1 / 3 + else: + monotonicity = 0.0 + + return jaccard * monotonicity + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_matusita.py b/abydos/distance/_matusita.py new file mode 100644 index 000000000..41d65b9cf --- /dev/null +++ b/abydos/distance/_matusita.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._matusita. + +Matusita distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Matusita'] + + +class Matusita(_TokenDistance): + r"""Matusita distance. + + For two multisets X and Y drawn from an alphabet S, Matusita distance + :cite:`Matusita:1955` is + + .. math:: + + dist_{Matusita}(X, Y) = + \sqrt{\sum_{i \in S} \Bigg(\sqrt{\frac{|A_i|}{|A|}} - + \sqrt{\frac{|B_i|}{|B|}}\Bigg)^2} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Matusita instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Matusita, self).__init__(tokenizer=tokenizer, **kwargs) + + def dist_abs(self, src, tar): + """Return the Matusita distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Matusita distance + + Examples + -------- + >>> cmp = Matusita() + >>> cmp.dist_abs('cat', 'hat') + 1.0 + >>> cmp.dist_abs('Niall', 'Neil') + 1.126811100699571 + >>> cmp.dist_abs('aluminum', 'Catalan') + 1.3282687000770907 + >>> cmp.dist_abs('ATCG', 'TAGC') + 1.414213562373095 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + + src_card = self._src_card() + if src_card == 0: + src_card = 1.0 + tar_card = self._tar_card() + if tar_card == 0: + tar_card = 1.0 + + return ( + sum( + ( + (abs(self._src_tokens[tok]) / src_card) ** 0.5 + - (abs(self._tar_tokens[tok]) / tar_card) ** 0.5 + ) + ** 2 + for tok in alphabet + ) + ) ** 0.5 + + def dist(self, src, tar): + """Return the normalized Matusita distance of two strings. 
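The Matusita dist_abs above, together with the sqrt(2)-normalized dist that follows, compares square roots of relative bigram frequencies. A short sketch under the usual bigram assumption; for 'cat'/'hat' each of the four unshared bigrams contributes (1/2)^2 to the inner sum, giving exactly 1.0:

    from collections import Counter


    def bigrams(word):
        padded = '$' + word + '#'
        return Counter(padded[i:i + 2] for i in range(len(padded) - 1))


    def matusita_dist_abs(src, tar):
        s, t = bigrams(src), bigrams(tar)
        s_total = sum(s.values()) or 1.0
        t_total = sum(t.values()) or 1.0
        return sum(
            ((s[tok] / s_total) ** 0.5 - (t[tok] / t_total) ** 0.5) ** 2
            for tok in set(s) | set(t)
        ) ** 0.5


    def matusita_dist(src, tar):
        """Normalized to [0, 1] by dividing by sqrt(2)."""
        if src == tar:
            return 0.0
        return matusita_dist_abs(src, tar) / 2 ** 0.5


    print(matusita_dist_abs('cat', 'hat'))        # 1.0
    print(round(matusita_dist('cat', 'hat'), 6))  # 0.707107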
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Matusita distance + + Examples + -------- + >>> cmp = Matusita() + >>> cmp.dist('cat', 'hat') + 0.707106781186547 + >>> cmp.dist('Niall', 'Neil') + 0.796775770420944 + >>> cmp.dist('aluminum', 'Catalan') + 0.939227805062351 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + return round(self.dist_abs(src, tar) / 2 ** 0.5, 15) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_maxwell_pilliner.py b/abydos/distance/_maxwell_pilliner.py new file mode 100644 index 000000000..15ab37efc --- /dev/null +++ b/abydos/distance/_maxwell_pilliner.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._maxwell_pilliner. + +Maxwell & Pilliner correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['MaxwellPilliner'] + + +class MaxwellPilliner(_TokenDistance): + r"""Maxwell & Pilliner correlation. + + For two sets X and Y and a population N, Maxwell & Pilliner correlation + :cite:`Maxwell:1968` is + + .. math:: + + corr_{MaxwellPilliner}(X, Y) = + \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + {|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{MaxwellPilliner} = + \frac{2(ad-bc)}{(a+b)(c+d)+(a+c)(b+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize MaxwellPilliner instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(MaxwellPilliner, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Maxwell & Pilliner correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Maxwell & Pilliner correlation + + Examples + -------- + >>> cmp = MaxwellPilliner() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.35921989956790845 + >>> cmp.corr('aluminum', 'Catalan') + 0.10803030303030303 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + if num: + return 2 * num / ((a + b) * (c + d) + (a + c) * (b + d)) + return 0.0 + + def sim(self, src, tar): + """Return the Maxwell & Pilliner similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Maxwell & Pilliner similarity + + Examples + -------- + >>> cmp = MaxwellPilliner() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6796099497839543 + >>> cmp.sim('aluminum', 'Catalan') + 0.5540151515151515 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_mcconnaughey.py b/abydos/distance/_mcconnaughey.py new file mode 100644 index 000000000..1b7055a90 --- /dev/null +++ b/abydos/distance/_mcconnaughey.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._mcconnaughey. + +McConnaughey correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['McConnaughey'] + + +class McConnaughey(_TokenDistance): + r"""McConnaughey correlation. + + For two sets X and Y, McConnaughey correlation :cite:`McConnaughey:1964` is + + .. math:: + + corr_{McConnaughey}(X, Y) = + \frac{|X \cap Y|^2 - |X \setminus Y| \cdot |Y \setminus X|} + {|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{McConnaughey} = + \frac{a^2-bc}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize McConnaughey instance. 
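The Maxwell & Pilliner correlation above also needs d, the number of alphabet bigrams absent from both strings, so its value depends on an assumed population size. The sketch below takes that population to be 784 (28 ** 2), an assumption inferred from the doctest values rather than stated in this diff:

    from collections import Counter

    POPULATION = 784  # assumed default bigram alphabet size (28 ** 2)


    def bigrams(word):
        padded = '$' + word + '#'
        return Counter(padded[i:i + 2] for i in range(len(padded) - 1))


    def maxwell_pilliner_corr(src, tar):
        s, t = bigrams(src), bigrams(tar)
        a = sum((s & t).values())
        b = sum((s - t).values())
        c = sum((t - s).values())
        d = POPULATION - (a + b + c)
        num = a * d - b * c
        if not num:
            return 0.0
        return 2 * num / ((a + b) * (c + d) + (a + c) * (b + d))


    print(maxwell_pilliner_corr('cat', 'hat'))     # 0.49743589743589745
    print(maxwell_pilliner_corr('Niall', 'Neil'))  # ~0.3592199, cf. the doctest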
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(McConnaughey, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the McConnaughey correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + McConnaughey correlation + + Examples + -------- + >>> cmp = McConnaughey() + >>> cmp.corr('cat', 'hat') + 0.0 + >>> cmp.corr('Niall', 'Neil') + -0.26666666666666666 + >>> cmp.corr('aluminum', 'Catalan') + -0.7638888888888888 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + num = ( + self._intersection_card() ** 2 + - self._src_only_card() * self._tar_only_card() + ) + + if num: + return num / (self._src_card() * self._tar_card()) + return 0.0 + + def sim(self, src, tar): + """Return the McConnaughey similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + McConnaughey similarity + + Examples + -------- + >>> cmp = McConnaughey() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3666666666666667 + >>> cmp.sim('aluminum', 'Catalan') + 0.11805555555555558 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_mcewen_michael.py b/abydos/distance/_mcewen_michael.py new file mode 100644 index 000000000..1826281bb --- /dev/null +++ b/abydos/distance/_mcewen_michael.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._mcewen_michael. 
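A quick check on the McConnaughey correlation just added: algebraically, (a*a - b*c) / ((a+b)(a+c)) equals a/(a+b) + a/(a+c) - 1, the sum of the two directed overlap proportions minus one, which makes the doctest values easy to verify. A small numerical confirmation under the usual bigram assumption:

    from collections import Counter


    def bigram_counts(src, tar):
        def bigrams(word):
            padded = '$' + word + '#'
            return Counter(padded[i:i + 2] for i in range(len(padded) - 1))

        s, t = bigrams(src), bigrams(tar)
        a = sum((s & t).values())  # shared
        b = sum((s - t).values())  # src only
        c = sum((t - s).values())  # tar only
        return a, b, c


    def mcconnaughey_corr(src, tar):
        a, b, c = bigram_counts(src, tar)
        return (a * a - b * c) / ((a + b) * (a + c))


    def overlap_form(src, tar):
        a, b, c = bigram_counts(src, tar)
        return a / (a + b) + a / (a + c) - 1


    print(mcconnaughey_corr('Niall', 'Neil'))  # -0.26666666666666666
    print(overlap_form('Niall', 'Neil'))       # same value, up to float rounding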
+ +McEwen & Michael correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['McEwenMichael'] + + +class McEwenMichael(_TokenDistance): + r"""McEwen & Michael correlation. + + For two sets X and Y and a population N, the McEwen & Michael + correlation :cite:`Michael:1920` is + + .. math:: + + corr_{McEwenMichael}(X, Y) = + \frac{4(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + {(|X \cap Y| + |(N \setminus X) \setminus Y|)^2 + + (|X \setminus Y| + |Y \setminus X|)^2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{McEwenMichael} = + \frac{4(ad-bc)}{(a+d)^2+(b+c)^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Michael instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(McEwenMichael, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the McEwen & Michael correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Michael correlation + + Examples + -------- + >>> cmp = McEwenMichael() + >>> cmp.corr('cat', 'hat') + 0.010203544942933782 + >>> cmp.corr('Niall', 'Neil') + 0.010189175491654217 + >>> cmp.corr('aluminum', 'Catalan') + 0.0048084299262381456 + >>> cmp.corr('ATCG', 'TAGC') + -0.00016689587032858459 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + + if num: + return 4 * num / ((a + d) ** 2 + (b + c) ** 2) + return 0.0 + + def sim(self, src, tar): + """Return the McEwen & Michael similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Michael similarity + + Examples + -------- + >>> cmp = McEwenMichael() + >>> cmp.sim('cat', 'hat') + 0.5051017724714669 + >>> cmp.sim('Niall', 'Neil') + 0.5050945877458272 + >>> cmp.sim('aluminum', 'Catalan') + 0.502404214963119 + >>> cmp.sim('ATCG', 'TAGC') + 0.4999165520648357 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_meta_levenshtein.py b/abydos/distance/_meta_levenshtein.py new file mode 100644 index 000000000..cd9fb0385 --- /dev/null +++ b/abydos/distance/_meta_levenshtein.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._meta_levenshtein. + +Meta-Levenshtein distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Counter +from math import log1p + +from numpy import float as np_float +from numpy import zeros as np_zeros + +from ._distance import _Distance +from ._jaro_winkler import JaroWinkler +from ..corpus import UnigramCorpus +from ..tokenizer import QGrams, WhitespaceTokenizer + +__all__ = ['MetaLevenshtein'] + + +class MetaLevenshtein(_Distance): + r"""Meta-Levenshtein distance. + + Meta-Levenshtein distance :cite:`Moreau:2008` combines Soft-TFIDF with + Levenshtein alignment. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + tokenizer=None, + corpus=None, + metric=None, + normalizer=max, + **kwargs + ): + """Initialize MetaLevenshtein instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + corpus : UnigramCorpus + A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus will + be created from the two words when a similarity function is called. + metric : _Distance + A string distance measure class for making soft matches, by default + Jaro-Winkler. + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(MetaLevenshtein, self).__init__(**kwargs) + self._corpus = corpus + self._metric = metric + self._normalizer = normalizer + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + if self._metric is None: + self._metric = JaroWinkler() + + def dist_abs(self, src, tar): + """Return the Meta-Levenshtein distance of two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Meta-Levenshtein distance + + Examples + -------- + >>> cmp = MetaLevenshtein() + >>> cmp.dist_abs('cat', 'hat') + 0.6155602628882225 + >>> cmp.dist_abs('Niall', 'Neil') + 2.538900657220556 + >>> cmp.dist_abs('aluminum', 'Catalan') + 6.940747163450747 + >>> cmp.dist_abs('ATCG', 'TAGC') + 3.2311205257764453 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + if not src: + return float(len(tar)) + if not tar: + return float(len(src)) + + src_tok = self.params['tokenizer'].tokenize(src) + src_ordered = src_tok.get_list() + src_tok = src_tok.get_counter() + + tar_tok = self.params['tokenizer'].tokenize(tar) + tar_ordered = tar_tok.get_list() + tar_tok = tar_tok.get_counter() + + if self._corpus is None: + corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer']) + corpus.add_document(src) + corpus.add_document(tar) + else: + corpus = self._corpus + + dists = Counter() + s_toks = set(src_tok.keys()) + t_toks = set(tar_tok.keys()) + for s_tok in s_toks: + for t_tok in t_toks: + dists[(s_tok, t_tok)] = ( + self._metric.dist(s_tok, t_tok) if s_tok != t_tok else 0 + ) + + vws_dict = {} + vwt_dict = {} + for token in src_tok.keys(): + vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token) + for token in tar_tok.keys(): + vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token) + + def _dist(s_tok, t_tok): + return dists[(s_tok, t_tok)] * vws_dict[s_tok] * vwt_dict[t_tok] + + d_mat = np_zeros( + (len(src_ordered) + 1, len(tar_ordered) + 1), dtype=np_float + ) + for i in range(len(src_ordered) + 1): + d_mat[i, 0] = i + for j in range(len(tar_ordered) + 1): + d_mat[0, j] = j + + for i in range(len(src_ordered)): + for j in range(len(tar_ordered)): + d_mat[i + 1, j + 1] = min( + d_mat[i + 1, j] + 1, # ins + d_mat[i, j + 1] + 1, # del + d_mat[i, j] + + _dist(src_ordered[i], tar_ordered[j]), # sub/== + ) + + return d_mat[len(src_ordered), len(tar_ordered)] + + def dist(self, src, tar): + """Return the normalized Levenshtein distance between two strings. + + The Levenshtein distance is normalized by dividing the Levenshtein + distance (calculated by any of the three supported methods) by the + greater of the number of characters in src times the cost of a delete + and the number of characters in tar times the cost of an insert. + For the case in which all operations have :math:`cost = 1`, this is + equivalent to the greater of the length of the two strings src & tar. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The normalized Levenshtein distance between src & tar + + Examples + -------- + >>> cmp = MetaLevenshtein() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.205186754296 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 0.507780131444 + >>> cmp.dist('aluminum', 'Catalan') + 0.8675933954313434 + >>> cmp.dist('ATCG', 'TAGC') + 0.8077801314441113 + + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + + """ + if src == tar: + return 0.0 + + return self.dist_abs(src, tar) / ( + self._normalizer( + [ + self.dist_abs(src, ' ' * len(tar)), + self.dist_abs(src, ' ' * len(src)), + ] + ) + if self._corpus + else self._normalizer([len(src), len(tar)]) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_michelet.py b/abydos/distance/_michelet.py new file mode 100644 index 000000000..31b497645 --- /dev/null +++ b/abydos/distance/_michelet.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._michelet. + +Michelet similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Michelet'] + + +class Michelet(_TokenDistance): + r"""Michelet similarity. + + For two sets X and Y and a population N, Michelet similarity + :cite:`Turner:1988` is + + .. math:: + + sim_{Michelet}(X, Y) = + \frac{|X \cap Y|^2}{|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Michelet} = + \frac{a^2}{(a+b)(a+c)} + + Following :cite:`SequentiX:2018`, this is termed "Michelet", though + Turner is most often listed as the first author in papers presenting this + measure. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Michelet instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Michelet, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Michelet similarity of two strings. 
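The Meta-Levenshtein dist_abs above is a standard Levenshtein dynamic program over token sequences, with the substitution cost for a token pair given by their string distance (Jaro-Winkler by default) scaled by log(1+tf)*idf weights from a unigram corpus. A stripped-down sketch of just that DP skeleton with a pluggable substitution cost; the toy cost here is purely illustrative, not the class's weighted cost:

    def weighted_levenshtein(src_toks, tar_toks, sub_cost):
        """Levenshtein DP over token lists: unit ins/del, custom sub cost."""
        rows, cols = len(src_toks) + 1, len(tar_toks) + 1
        d = [[0.0] * cols for _ in range(rows)]
        for i in range(rows):
            d[i][0] = float(i)
        for j in range(cols):
            d[0][j] = float(j)
        for i in range(1, rows):
            for j in range(1, cols):
                d[i][j] = min(
                    d[i][j - 1] + 1,  # insertion
                    d[i - 1][j] + 1,  # deletion
                    d[i - 1][j - 1]
                    + sub_cost(src_toks[i - 1], tar_toks[j - 1]),  # sub/match
                )
        return d[-1][-1]


    def toy_sub_cost(s_tok, t_tok):
        # Illustrative only: 0 for identical tokens, 1 otherwise.
        return 0.0 if s_tok == t_tok else 1.0


    # Over '$'/'#'-padded bigrams this degenerates to plain token Levenshtein.
    src = ['$c', 'ca', 'at', 't#']
    tar = ['$h', 'ha', 'at', 't#']
    print(weighted_levenshtein(src, tar, toy_sub_cost))  # 2.0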
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Michelet similarity + + Examples + -------- + >>> cmp = Michelet() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.13333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.013888888888888888 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + + if not a: + return 0.0 + return a * a / (apb * apc) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_minhash.py b/abydos/distance/_minhash.py new file mode 100644 index 000000000..41df57987 --- /dev/null +++ b/abydos/distance/_minhash.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._minhash. + +MinHash similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from hashlib import sha512 + +import numpy as np + +from ._distance import _Distance +from ..tokenizer import QGrams, WhitespaceTokenizer + +__all__ = ['MinHash'] + + +_MININT = np.iinfo(np.int64).min +_MAXINT = np.iinfo(np.int64).max + + +class MinHash(_Distance): + r"""MinHash similarity. + + MinHash similarity :cite:`Broder:1997` is a method of approximating the + intersection over the union of two sets. This implementation is based on + :cite:`Kula:2015`. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, k=0, seed=10, **kwargs): + """Initialize MinHash instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + k : int + The number of hash functions to use for similarity estimation + seed : int + A seed value for the random functions + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + self._k = k + self._seed = seed + super(MinHash, self).__init__(tokenizer=tokenizer, **kwargs) + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + def sim(self, src, tar): + """Return the MinHash similarity of two strings. 
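A side note on the Michelet similarity implemented above: a*a / ((a+b)(a+c)) is the square of the cosine similarity a / sqrt((a+b)(a+c)) over the same counts, which makes the doctest values easy to verify by hand:

    from collections import Counter


    def bigrams(word):
        padded = '$' + word + '#'
        return Counter(padded[i:i + 2] for i in range(len(padded) - 1))


    def michelet_sim(src, tar):
        s, t = bigrams(src), bigrams(tar)
        a = sum((s & t).values())
        apb = sum(s.values())  # a + b = |X|
        apc = sum(t.values())  # a + c = |Y|
        return (a * a) / (apb * apc) if a else 0.0


    def cosine_sim(src, tar):
        s, t = bigrams(src), bigrams(tar)
        a = sum((s & t).values())
        return a / (sum(s.values()) * sum(t.values())) ** 0.5


    print(michelet_sim('cat', 'hat'))     # 0.25
    print(cosine_sim('cat', 'hat') ** 2)  # 0.25 as well
    print(michelet_sim('Niall', 'Neil'))  # 0.13333333333333333 (4/30)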
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + MinHash similarity + + Examples + -------- + >>> cmp = MinHash() + >>> cmp.sim('cat', 'hat') + 0.75 + >>> cmp.sim('Niall', 'Neil') + 1.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.5 + >>> cmp.sim('ATCG', 'TAGC') + 0.6 + + + .. versionadded:: 0.4.0 + + """ + if not src and not tar: + return 1.0 + + src_tokens = self.params['tokenizer'].tokenize(src).get_set() + tar_tokens = self.params['tokenizer'].tokenize(tar).get_set() + + k = self._k if self._k else max(len(src_tokens), len(tar_tokens)) + + masks = np.random.RandomState(seed=self._seed).randint( + _MININT, _MAXINT, k, dtype=np.int64 + ) + + hashes_src = np.full(k, _MAXINT, dtype=np.int64) + hashes_tar = np.full(k, _MAXINT, dtype=np.int64) + + for tok in src_tokens: + hashes_src = np.minimum( + hashes_src, + np.bitwise_xor( + masks, int(sha512(tok.encode()).hexdigest(), 16) + ), + ) + + for tok in tar_tokens: + hashes_tar = np.minimum( + hashes_tar, + np.bitwise_xor( + masks, int(sha512(tok.encode()).hexdigest(), 16) + ), + ) + + return (hashes_src == hashes_tar).sum() / k + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_minkowski.py b/abydos/distance/_minkowski.py index b737717ad..00bc2bde5 100644 --- a/abydos/distance/_minkowski.py +++ b/abydos/distance/_minkowski.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,9 +28,10 @@ unicode_literals, ) -from numbers import Number +from deprecation import deprecated from ._token_distance import _TokenDistance +from .. import __version__ __all__ = ['Minkowski', 'dist_minkowski', 'minkowski', 'sim_minkowski'] @@ -40,11 +41,61 @@ class Minkowski(_TokenDistance): The Minkowski distance :cite:`Minkowski:1910` is a distance metric in :math:`L^p-space`. + + .. versionadded:: 0.3.6 """ - def dist_abs( - self, src, tar, qval=2, pval=1, normalized=False, alphabet=None + def __init__( + self, + pval=1, + alphabet=0, + tokenizer=None, + intersection_type='crisp', + **kwargs ): + """Initialize Euclidean instance. + + Parameters + ---------- + pval : int + The :math:`p`-value of the :math:`L^p`-space + alphabet : collection or int + The values or size of the alphabet + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(Minkowski, self).__init__( + tokenizer=tokenizer, + alphabet=alphabet, + intersection_type=intersection_type, + **kwargs + ) + self.set_params(pval=pval) + + def dist_abs(self, src, tar, normalized=False): """Return the Minkowski distance (:math:`L^p`-norm) of two strings. Parameters @@ -53,14 +104,8 @@ def dist_abs( Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version - pval : int or float - The :math:`p`-value of the :math:`L^p`-space normalized : bool Normalizes to [0, 1] if True - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -79,34 +124,42 @@ def dist_abs( >>> cmp.dist_abs('ATCG', 'TAGC') 10.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - q_src, q_tar = self._get_qgrams(src, tar, qval) - diffs = ((q_src - q_tar) + (q_tar - q_src)).values() + self._tokenize(src, tar) + diffs = self._symmetric_difference().values() normalizer = 1 if normalized: - totals = (q_src + q_tar).values() - if alphabet is not None: - # noinspection PyTypeChecker - normalizer = ( - alphabet if isinstance(alphabet, Number) else len(alphabet) - ) - elif pval == 0: + totals = self._total().values() + if self.params['alphabet']: + normalizer = self.params['alphabet'] + elif self.params['pval'] == 0: normalizer = len(totals) else: - normalizer = sum(_ ** pval for _ in totals) ** (1 / pval) + normalizer = sum(_ ** self.params['pval'] for _ in totals) ** ( + 1 / self.params['pval'] + ) if len(diffs) == 0: return 0.0 - if pval == float('inf'): + if self.params['pval'] == float('inf'): # Chebyshev distance return max(diffs) / normalizer - if pval == 0: + if self.params['pval'] == 0: # This is the l_0 "norm" as developed by David Donoho - return len(diffs) / normalizer - return sum(_ ** pval for _ in diffs) ** (1 / pval) / normalizer - - def dist(self, src, tar, qval=2, pval=1, alphabet=None): + return sum(_ != 0 for _ in diffs) / normalizer + return ( + sum(_ ** self.params['pval'] for _ in diffs) + ** (1 / self.params['pval']) + / normalizer + ) + + def dist(self, src, tar): """Return normalized Minkowski distance of two strings. The normalized Minkowski distance :cite:`Minkowski:1910` is a distance @@ -118,12 +171,6 @@ def dist(self, src, tar, qval=2, pval=1, alphabet=None): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version - pval : int or float - The :math:`p`-value of the :math:`L^p`-space - alphabet : collection or int - The values or size of the alphabet Returns ------- @@ -142,11 +189,22 @@ def dist(self, src, tar, qval=2, pval=1, alphabet=None): >>> cmp.dist('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return self.dist_abs(src, tar, qval, pval, True, alphabet) + return self.dist_abs(src, tar, normalized=True) -def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Minkowski.dist_abs method instead.', +) +def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=0): """Return the Minkowski distance (:math:`L^p`-norm) of two strings. This is a wrapper for :py:meth:`Minkowski.dist_abs`. 
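The MinHash class added just above (before the Minkowski changes) estimates the Jaccard index of two token sets by comparing, for each of k masks, the minimum masked hash value each set attains. The sketch below approximates the same idea with plain hashlib and 64-bit masks; its numbers will not match the class's doctests, since the class uses full 512-bit digests, NumPy int64 masks, and by default ties k to the token-set sizes:

    from hashlib import sha512
    from random import Random


    def bigram_set(word):
        padded = '$' + word + '#'
        return {padded[i:i + 2] for i in range(len(padded) - 1)}


    def minhash_sim(src, tar, k=64, seed=10):
        """Fraction of the k per-mask minima on which the two sets agree."""
        src_toks, tar_toks = bigram_set(src), bigram_set(tar)
        if not src_toks and not tar_toks:
            return 1.0

        rng = Random(seed)
        masks = [rng.getrandbits(64) for _ in range(k)]

        def h64(tok):
            # 64-bit token hash taken from the low bits of its SHA-512 digest.
            return int(sha512(tok.encode()).hexdigest(), 16) & ((1 << 64) - 1)

        def signature(tokens):
            hashes = [h64(tok) for tok in tokens]
            return [min(h ^ mask for h in hashes) for mask in masks]

        matches = sum(
            a == b for a, b in zip(signature(src_toks), signature(tar_toks))
        )
        return matches / k


    print(minhash_sim('cat', 'hat'))    # an estimate near 1/3, the true Jaccard
    print(minhash_sim('ATCG', 'TAGC'))  # 0.0 (the bigram sets are disjoint)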
@@ -158,7 +216,7 @@ def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram pval : int or float The :math:`p`-value of the :math:`L^p`-space normalized : bool @@ -182,11 +240,21 @@ def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=None): >>> minkowski('ATCG', 'TAGC') 10.0 + .. versionadded:: 0.3.0 + """ - return Minkowski().dist_abs(src, tar, qval, pval, normalized, alphabet) + return Minkowski(pval=pval, alphabet=alphabet, qval=qval).dist_abs( + src, tar, normalized=normalized + ) -def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Minkowski.dist method instead.', +) +def dist_minkowski(src, tar, qval=2, pval=1, alphabet=0): """Return normalized Minkowski distance of two strings. This is a wrapper for :py:meth:`Minkowski.dist`. @@ -198,7 +266,7 @@ def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram pval : int or float The :math:`p`-value of the :math:`L^p`-space alphabet : collection or int @@ -220,11 +288,21 @@ def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None): >>> dist_minkowski('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.3.0 + """ - return Minkowski().dist(src, tar, qval, pval, alphabet) + return Minkowski(pval=pval, alphabet=alphabet, qval=qval).dist_abs( + src, tar, normalized=True + ) -def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Minkowski.sim method instead.', +) +def sim_minkowski(src, tar, qval=2, pval=1, alphabet=0): """Return normalized Minkowski similarity of two strings. This is a wrapper for :py:meth:`Minkowski.sim`. @@ -236,7 +314,7 @@ def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram pval : int or float The :math:`p`-value of the :math:`L^p`-space alphabet : collection or int @@ -258,8 +336,10 @@ def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None): >>> sim_minkowski('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.3.0 + """ - return Minkowski().sim(src, tar, qval, pval, alphabet) + return Minkowski(pval=pval, alphabet=alphabet, qval=qval).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_mlipns.py b/abydos/distance/_mlipns.py index 3c54b6bee..5227f3203 100644 --- a/abydos/distance/_mlipns.py +++ b/abydos/distance/_mlipns.py @@ -28,8 +28,11 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance from ._hamming import Hamming +from .. import __version__ __all__ = ['MLIPNS', 'dist_mlipns', 'sim_mlipns'] @@ -41,25 +44,43 @@ class MLIPNS(_Distance): :cite:`Shannaq:2010`. This function returns only 1.0 (similar) or 0.0 (not similar). LIPNS similarity is identical to normalized Hamming similarity. + + .. versionadded:: 0.3.6 """ - hamming = Hamming() + _hamming = Hamming(diff_lens=True) - def sim(self, src, tar, threshold=0.25, max_mismatches=2): - """Return the MLIPNS similarity of two strings. 
+ def __init__(self, threshold=0.25, max_mismatches=2, **kwargs): + """Initialize MLIPNS instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison threshold : float A number [0, 1] indicating the maximum similarity score, below which the strings are considered 'similar' (0.25 by default) max_mismatches : int A number indicating the allowable number of mismatches to remove before declaring two strings not similar (2 by default) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(MLIPNS, self).__init__(**kwargs) + self._threshold = threshold + self._max_mismatches = max_mismatches + + def sim(self, src, tar): + """Return the MLIPNS similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -77,6 +98,11 @@ def sim(self, src, tar, threshold=0.25, max_mismatches=2): >>> sim_mlipns('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if tar == src: return 1.0 @@ -84,12 +110,12 @@ def sim(self, src, tar, threshold=0.25, max_mismatches=2): return 0.0 mismatches = 0 - ham = Hamming().dist_abs(src, tar, diff_lens=True) + ham = self._hamming.dist_abs(src, tar) max_length = max(len(src), len(tar)) - while src and tar and mismatches <= max_mismatches: + while src and tar and mismatches <= self._max_mismatches: if ( max_length < 1 - or (1 - (max_length - ham) / max_length) <= threshold + or (1 - (max_length - ham) / max_length) <= self._threshold ): return 1.0 else: @@ -102,6 +128,12 @@ def sim(self, src, tar, threshold=0.25, max_mismatches=2): return 0.0 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MLIPNS.sim method instead.', +) def sim_mlipns(src, tar, threshold=0.25, max_mismatches=2): """Return the MLIPNS similarity of two strings. @@ -136,10 +168,18 @@ def sim_mlipns(src, tar, threshold=0.25, max_mismatches=2): >>> sim_mlipns('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ - return MLIPNS().sim(src, tar, threshold, max_mismatches) + return MLIPNS(threshold, max_mismatches).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MLIPNS.dist method instead.', +) def dist_mlipns(src, tar, threshold=0.25, max_mismatches=2): """Return the MLIPNS distance between two strings. @@ -174,8 +214,10 @@ def dist_mlipns(src, tar, threshold=0.25, max_mismatches=2): >>> dist_mlipns('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return MLIPNS().dist(src, tar, threshold, max_mismatches) + return MLIPNS(threshold, max_mismatches).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_monge_elkan.py b/abydos/distance/_monge_elkan.py index b64cfaa12..baaafa499 100644 --- a/abydos/distance/_monge_elkan.py +++ b/abydos/distance/_monge_elkan.py @@ -28,8 +28,11 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance -from ._levenshtein import sim_levenshtein +from ._levenshtein import Levenshtein, sim_levenshtein +from .. import __version__ from ..tokenizer import QGrams __all__ = ['MongeElkan', 'dist_monge_elkan', 'sim_monge_elkan'] @@ -46,9 +49,35 @@ class MongeElkan(_Distance): calculated, at the cost of doubling the computation time (since :math:`sim_{Monge-Elkan}(src, tar)` and :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged). + + .. 
versionadded:: 0.3.6 """ - def sim(self, src, tar, sim_func=sim_levenshtein, symmetric=False): + def __init__(self, sim_func=None, symmetric=False, **kwargs): + """Initialize MongeElkan instance. + + Parameters + ---------- + sim_func : function + The internal similarity metric to employ + symmetric : bool + Return a symmetric similarity measure + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(MongeElkan, self).__init__(**kwargs) + self._sim_func = sim_func + if isinstance(self._sim_func, _Distance): + self._sim_func = self._sim_func.sim + elif self._sim_func is None: + self._sim_func = Levenshtein().sim + self._symmetric = symmetric + + def sim(self, src, tar): """Return the Monge-Elkan similarity of two strings. Parameters @@ -57,10 +86,6 @@ def sim(self, src, tar, sim_func=sim_levenshtein, symmetric=False): Source string for comparison tar : str Target string for comparison - sim_func : function - The internal similarity metric to employ - symmetric : bool - Return a symmetric similarity measure Returns ------- @@ -79,12 +104,17 @@ def sim(self, src, tar, sim_func=sim_levenshtein, symmetric=False): >>> cmp.sim('ATCG', 'TAGC') 0.5 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 - q_src = sorted(QGrams(src).elements()) - q_tar = sorted(QGrams(tar).elements()) + q_src = sorted(QGrams().tokenize(src).get_list()) + q_tar = sorted(QGrams().tokenize(tar).get_list()) if not q_src or not q_tar: return 0.0 @@ -93,16 +123,29 @@ def sim(self, src, tar, sim_func=sim_levenshtein, symmetric=False): for q_s in q_src: max_sim = float('-inf') for q_t in q_tar: - max_sim = max(max_sim, sim_func(q_s, q_t)) + max_sim = max(max_sim, self._sim_func(q_s, q_t)) sum_of_maxes += max_sim sim_em = sum_of_maxes / len(q_src) - if symmetric: - sim_em = (sim_em + self.sim(tar, src, sim_func, False)) / 2 + if self._symmetric: + sum_of_maxes = 0 + for q_t in q_tar: + max_sim = float('-inf') + for q_s in q_src: + max_sim = max(max_sim, self._sim_func(q_t, q_s)) + sum_of_maxes += max_sim + sim_rev = sum_of_maxes / len(q_tar) + sim_em = (sim_em + sim_rev) / 2 return sim_em +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MongeElkan.sim method instead.', +) def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): """Return the Monge-Elkan similarity of two strings. @@ -135,10 +178,18 @@ def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): >>> sim_monge_elkan('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + """ - return MongeElkan().sim(src, tar, sim_func, symmetric) + return MongeElkan(sim_func, symmetric).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MongeElkan.dist method instead.', +) def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): """Return the Monge-Elkan distance between two strings. @@ -171,8 +222,10 @@ def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): >>> dist_monge_elkan('ATCG', 'TAGC') 0.5 + .. 
versionadded:: 0.1.0 + """ - return MongeElkan().dist(src, tar, sim_func, symmetric) + return MongeElkan(sim_func, symmetric).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_mountford.py b/abydos/distance/_mountford.py new file mode 100644 index 000000000..2befd5d44 --- /dev/null +++ b/abydos/distance/_mountford.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._mountford. + +Mountford similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Mountford'] + + +class Mountford(_TokenDistance): + r"""Mountford similarity. + + For two sets X and Y, the Mountford similarity :cite:`Mountford:1962` is + + .. math:: + + sim_{Mountford}(X, Y) = + \frac{2|X \cap Y|}{2|X|\cdot|Y|-(|X|+|Y|)\cdot|X \cap Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Mountford} = + \frac{2a}{2(a+b)(a+c)-(2a+b+c)a} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Mountford instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Mountford, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Mountford similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Mountford similarity + + Examples + -------- + >>> cmp = Mountford() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.10526315789473684 + >>> cmp.sim('aluminum', 'Catalan') + 0.015748031496062992 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + if not b: + b = 1 + if not c: + c = 1 + + if a: + return 2.0 * a / (c * (a + 2.0 * b) + a * b) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_mra.py b/abydos/distance/_mra.py index 25db06740..cfcdece9c 100644 --- a/abydos/distance/_mra.py +++ b/abydos/distance/_mra.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ from ..phonetic import mra __all__ = ['MRA', 'dist_mra', 'mra_compare', 'sim_mra'] @@ -41,6 +44,8 @@ class MRA(_Distance): The Western Airlines Surname Match Rating Algorithm comparison rating, as presented on page 18 of :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ def dist_abs(self, src, tar): @@ -70,6 +75,11 @@ def dist_abs(self, src, tar): >>> cmp.dist_abs('ATCG', 'TAGC') 5 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 6 @@ -140,10 +150,21 @@ def sim(self, src, tar): >>> cmp.sim('ATCG', 'TAGC') 0.8333333333333334 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return mra_compare(src, tar) / 6 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MRA.dist_abs method instead.', +) def mra_compare(src, tar): """Return the MRA comparison rating of two strings. @@ -172,10 +193,18 @@ def mra_compare(src, tar): >>> mra_compare('ATCG', 'TAGC') 5 + .. versionadded:: 0.1.0 + """ return MRA().dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MRA.sim method instead.', +) def sim_mra(src, tar): """Return the normalized MRA similarity of two strings. @@ -204,10 +233,18 @@ def sim_mra(src, tar): >>> sim_mra('ATCG', 'TAGC') 0.8333333333333334 + .. versionadded:: 0.1.0 + """ return MRA().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MRA.dist method instead.', +) def dist_mra(src, tar): """Return the normalized MRA distance between two strings. @@ -236,6 +273,8 @@ def dist_mra(src, tar): >>> dist_mra('ATCG', 'TAGC') 0.16666666666666663 + .. versionadded:: 0.1.0 + """ return MRA().dist(src, tar) diff --git a/abydos/distance/_ms_contingency.py b/abydos/distance/_ms_contingency.py new file mode 100644 index 000000000..0a53d080c --- /dev/null +++ b/abydos/distance/_ms_contingency.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._ms_contingency. 
+ +Mean squared contingency correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['MSContingency'] + + +class MSContingency(_TokenDistance): + r"""Mean squared contingency correlation. + + For two sets X and Y and a population N, the mean squared contingency + correlation :cite:`Cole:1949` is + + .. math:: + + corr_{MSContingency}(X, Y) = + \frac{\sqrt{2}(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)} + {\sqrt{(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)^2 + + |X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}} + + :cite:`Hubalek:1982` and :cite:`Choi:2010` identify this as Cole + similarity. Although Cole discusses this correlation, he does not claim to + have developed it. Rather, he presents his coefficient of interspecific + association as being his own development: :class:`.Cole`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{MSContingency} = + \frac{\sqrt{2}(ad-bc)}{\sqrt{(ad-bc)^2+(a+b)(a+c)(b+d)(c+d)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize MSContingency instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(MSContingency, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the normalized mean squared contingency corr. of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Mean squared contingency correlation + + Examples + -------- + >>> cmp = MSContingency() + >>> cmp.corr('cat', 'hat') + 0.6298568508557214 + >>> cmp.corr('Niall', 'Neil') + 0.4798371954796814 + >>> cmp.corr('aluminum', 'Catalan') + 0.15214891090821628 + >>> cmp.corr('ATCG', 'TAGC') + -0.009076921903905553 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return -1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + ab = self._src_card() + ac = self._tar_card() + admbc = a * d - b * c + + if admbc: + return ( + 2 ** 0.5 + * admbc + / (admbc ** 2 + ab * ac * (b + d) * (c + d)) ** 0.5 + ) + return 0.0 + + def sim(self, src, tar): + """Return the normalized ms contingency similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Mean squared contingency similarity + + Examples + -------- + >>> cmp = MSContingency() + >>> cmp.sim('cat', 'hat') + 0.8149284254278607 + >>> cmp.sim('Niall', 'Neil') + 0.7399185977398407 + >>> cmp.sim('aluminum', 'Catalan') + 0.5760744554541082 + >>> cmp.sim('ATCG', 'TAGC') + 0.49546153904804724 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_mutual_information.py b/abydos/distance/_mutual_information.py new file mode 100644 index 000000000..46cfb10e1 --- /dev/null +++ b/abydos/distance/_mutual_information.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._mutual_information. + +Mutual Information similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['MutualInformation'] + + +class MutualInformation(_TokenDistance): + r"""Mutual Information similarity. + + For two sets X and Y and a population N, Mutual Information similarity + :cite:`Church:1991` is + + .. math:: + + sim_{MI}(X, Y) = + log_2(\frac{|X \cap Y| \cdot |N|}{|X| \cdot |Y|}) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{MI} = + log_2(\frac{an}{(a+b)(a+c)}) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize MutualInformation instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(MutualInformation, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Mutual Information similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Mutual Information similarity + + Examples + -------- + >>> cmp = MutualInformation() + >>> cmp.sim_score('cat', 'hat') + 6.528166795717758 + >>> cmp.sim_score('Niall', 'Neil') + 5.661433326581222 + >>> cmp.sim_score('aluminum', 'Catalan') + 3.428560943378589 + >>> cmp.sim_score('ATCG', 'TAGC') + -4.700439718141093 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + n = self._population_unique_card() + + return log((1 + a * n) / (1 + apb * apc), 2) + + def sim(self, src, tar): + """Return the normalized Mutual Information similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Mutual Information similarity + + Examples + -------- + >>> cmp = MutualInformation() + >>> cmp.sim('cat', 'hat') + 0.9336092530889809 + >>> cmp.sim('Niall', 'Neil') + 0.891168488172523 + >>> cmp.sim('aluminum', 'Catalan') + 0.7600321183863901 + >>> cmp.sim('ATCG', 'TAGC') + 0.1752299652353853 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score: + norm = [ + _ + for _ in [self.sim_score(src, src), self.sim_score(tar, tar)] + if _ != 0.0 + ] + if not norm: + norm = [1] + + return (1.0 + score / max(norm)) / 2.0 + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_ncd_arith.py b/abydos/distance/_ncd_arith.py index 42329c7de..26594d9e4 100644 --- a/abydos/distance/_ncd_arith.py +++ b/abydos/distance/_ncd_arith.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ from ..compression import Arithmetic __all__ = ['NCDarith', 'dist_ncd_arith', 'sim_ncd_arith'] @@ -40,15 +43,29 @@ class NCDarith(_Distance): Cf. https://en.wikipedia.org/wiki/Arithmetic_coding Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.3.6 """ - _coder = None + def __init__(self, probs=None, **kwargs): + """Initialize the arithmetic coder object. + + Parameters + ---------- + probs : dict + A dictionary trained with :py:meth:`Arithmetic.train` + + + .. versionadded:: 0.3.6 + .. 
versionchanged:: 0.3.6 + Encapsulated in class - def __init__(self): - """Initialize the arithmetic coder object.""" + """ + super(NCDarith, self).__init__(**kwargs) self._coder = Arithmetic() + self._probs = probs - def dist(self, src, tar, probs=None): + def dist(self, src, tar): """Return the NCD between two strings using arithmetic coding. Parameters @@ -57,8 +74,6 @@ def dist(self, src, tar, probs=None): Source string for comparison tar : str Target string for comparison - probs : dict - A dictionary trained with :py:meth:`Arithmetic.train` Returns ------- @@ -77,15 +92,20 @@ def dist(self, src, tar, probs=None): >>> cmp.dist('ATCG', 'TAGC') 0.6923076923076923 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 - if probs is None: + if self._probs is None: # lacking a reasonable dictionary, train on the strings themselves self._coder.train(src + tar) else: - self._coder.set_probs(probs) + self._coder.set_probs(self._probs) src_comp = self._coder.encode(src)[1] tar_comp = self._coder.encode(tar)[1] @@ -97,6 +117,12 @@ def dist(self, src, tar, probs=None): ) / max(src_comp, tar_comp) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDarith.dist method instead.', +) def dist_ncd_arith(src, tar, probs=None): """Return the NCD between two strings using arithmetic coding. @@ -127,10 +153,18 @@ def dist_ncd_arith(src, tar, probs=None): >>> dist_ncd_arith('ATCG', 'TAGC') 0.6923076923076923 + .. versionadded:: 0.3.5 + """ - return NCDarith().dist(src, tar, probs) + return NCDarith(probs).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDarith.sim method instead.', +) def sim_ncd_arith(src, tar, probs=None): """Return the NCD similarity between two strings using arithmetic coding. @@ -161,8 +195,10 @@ def sim_ncd_arith(src, tar, probs=None): >>> sim_ncd_arith('ATCG', 'TAGC') 0.3076923076923077 + .. versionadded:: 0.3.5 + """ - return NCDarith().sim(src, tar, probs) + return NCDarith(probs).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_ncd_bwtrle.py b/abydos/distance/_ncd_bwtrle.py index 1eff4994c..f99fe8a56 100644 --- a/abydos/distance/_ncd_bwtrle.py +++ b/abydos/distance/_ncd_bwtrle.py @@ -28,10 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from ._ncd_rle import NCDrle +from .. import __version__ from ..compression import BWT - __all__ = ['NCDbwtrle', 'dist_ncd_bwtrle', 'sim_ncd_bwtrle'] @@ -41,6 +43,8 @@ class NCDbwtrle(NCDrle): Cf. https://en.wikipedia.org/wiki/Burrows-Wheeler_transform Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.3.6 """ _bwt = BWT() @@ -72,6 +76,11 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.8 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 @@ -87,6 +96,12 @@ def dist(self, src, tar): ) / max(len(src_comp), len(tar_comp)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDbwtrle.dist method instead.', +) def dist_ncd_bwtrle(src, tar): """Return the NCD between two strings using BWT plus RLE. @@ -115,6 +130,8 @@ def dist_ncd_bwtrle(src, tar): >>> dist_ncd_bwtrle('ATCG', 'TAGC') 0.8 + .. 
versionadded:: 0.3.5 + """ return NCDbwtrle().dist(src, tar) @@ -147,6 +164,8 @@ def sim_ncd_bwtrle(src, tar): >>> sim_ncd_bwtrle('ATCG', 'TAGC') 0.19999999999999996 + .. versionadded:: 0.3.5 + """ return NCDbwtrle().sim(src, tar) diff --git a/abydos/distance/_ncd_bz2.py b/abydos/distance/_ncd_bz2.py index 6519248c9..f4e4e2fdd 100644 --- a/abydos/distance/_ncd_bz2.py +++ b/abydos/distance/_ncd_bz2.py @@ -30,7 +30,10 @@ import bz2 +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ __all__ = ['NCDbz2', 'dist_ncd_bz2', 'sim_ncd_bz2'] @@ -41,6 +44,8 @@ class NCDbz2(_Distance): Cf. https://en.wikipedia.org/wiki/Bzip2 Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.3.6 """ _level = 9 @@ -53,6 +58,11 @@ def __init__(self, level=9): level : int The compression level (0 to 9) + + .. versionadded:: 0.3.6 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ self._level = level @@ -83,6 +93,11 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.03125 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 @@ -101,6 +116,12 @@ def dist(self, src, tar): ) / max(len(src_comp), len(tar_comp)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDbz2.dist method instead.', +) def dist_ncd_bz2(src, tar): """Return the NCD between two strings using bzip2 compression. @@ -129,10 +150,18 @@ def dist_ncd_bz2(src, tar): >>> dist_ncd_bz2('ATCG', 'TAGC') 0.03125 + .. versionadded:: 0.3.5 + """ return NCDbz2().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDbz2.sim method instead.', +) def sim_ncd_bz2(src, tar): """Return the NCD similarity between two strings using bzip2 compression. @@ -161,6 +190,8 @@ def sim_ncd_bz2(src, tar): >>> sim_ncd_bz2('ATCG', 'TAGC') 0.96875 + .. versionadded:: 0.3.5 + """ return NCDbz2().sim(src, tar) diff --git a/abydos/distance/_ncd_lzma.py b/abydos/distance/_ncd_lzma.py index 26ae83664..f0eaa7d91 100644 --- a/abydos/distance/_ncd_lzma.py +++ b/abydos/distance/_ncd_lzma.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ try: import lzma @@ -46,6 +49,8 @@ class NCDlzma(_Distance): Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Markov_chain_algorithm Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.3.6 """ def dist(self, src, tar): @@ -80,6 +85,11 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.08695652173913043 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 @@ -103,6 +113,12 @@ def dist(self, src, tar): ) / max(len(src_comp), len(tar_comp)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDlzma.dist method instead.', +) def dist_ncd_lzma(src, tar): """Return the NCD between two strings using LZMA compression. @@ -131,10 +147,18 @@ def dist_ncd_lzma(src, tar): >>> dist_ncd_lzma('ATCG', 'TAGC') 0.08695652173913043 + .. versionadded:: 0.3.5 + """ return NCDlzma().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDlzma.sim method instead.', +) def sim_ncd_lzma(src, tar): """Return the NCD similarity between two strings using LZMA compression. 
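The NCD* classes in this patch all compute the same normalized compression distance :cite:`Cilibrasi:2005`, differing only in the compressor plugged into it. As a self-contained sketch of that shared formula (an editor's illustration using zlib for concreteness, not the library's ``NCDzlib``, which additionally subtracts two bytes from the compressed-length denominator)::

    import zlib

    def ncd(src, tar, level=9):
        # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)),
        # where C(.) is the length of the compressed byte string. As in the
        # classes above, both concatenation orders are compressed and the
        # smaller result is used.
        src_b, tar_b = src.encode('utf-8'), tar.encode('utf-8')
        c_src = len(zlib.compress(src_b, level))
        c_tar = len(zlib.compress(tar_b, level))
        c_cat = min(
            len(zlib.compress(src_b + tar_b, level)),
            len(zlib.compress(tar_b + src_b, level)),
        )
        return (c_cat - min(c_src, c_tar)) / max(c_src, c_tar)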
@@ -163,6 +187,8 @@ def sim_ncd_lzma(src, tar): >>> sim_ncd_lzma('ATCG', 'TAGC') 0.9130434782608696 + .. versionadded:: 0.3.5 + """ return NCDlzma().sim(src, tar) diff --git a/abydos/distance/_ncd_lzss.py b/abydos/distance/_ncd_lzss.py new file mode 100644 index 000000000..838aaffeb --- /dev/null +++ b/abydos/distance/_ncd_lzss.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._ncd_lzss. + +NCD using LZSS +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +try: + import lzss +except ImportError: # pragma: no cover + # If the system lacks the lzss library, that's fine, but LZSS compression + # similarity won't be supported. + lzss = None + +__all__ = ['NCDlzss'] + + +class NCDlzss(_Distance): + """Normalized Compression Distance using LZSS compression. + + Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Storer-Szymanski + + Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.4.0 + """ + + def dist(self, src, tar): + """Return the NCD between two strings using LZSS compression. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Compression distance + + Raises + ------ + ValueError + Install the PyLZSS module in order to use LZSS + + Examples + -------- + >>> cmp = NCDlzss() + >>> cmp.dist('cat', 'hat') + 0.75 + >>> cmp.dist('Niall', 'Neil') + 1.0 + >>> cmp.dist('aluminum', 'Catalan') + 1.0 + >>> cmp.dist('ATCG', 'TAGC') + 0.8 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + if lzss is not None: + src_comp = lzss.encode(src) + tar_comp = lzss.encode(tar) + concat_comp = lzss.encode(src + tar) + concat_comp2 = lzss.encode(tar + src) + else: # pragma: no cover + raise ValueError('Install the PyLZSS module in order to use LZSS') + + return ( + min(len(concat_comp), len(concat_comp2)) + - min(len(src_comp), len(tar_comp)) + ) / max(len(src_comp), len(tar_comp)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_ncd_paq9a.py b/abydos/distance/_ncd_paq9a.py new file mode 100644 index 000000000..fa836e8ff --- /dev/null +++ b/abydos/distance/_ncd_paq9a.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._ncd_paq9a. + +NCD using PAQ9A +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +try: + import paq +except ImportError: # pragma: no cover + # If the system lacks the paq9a library, that's fine, but PAQ9A compression + # similarity won't be supported. + paq = None + +__all__ = ['NCDpaq9a'] + + +class NCDpaq9a(_Distance): + """Normalized Compression Distance using PAQ9A compression. + + Cf. http://mattmahoney.net/dc/#paq9a + + Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.4.0 + """ + + def dist(self, src, tar): + """Return the NCD between two strings using PAQ9A compression. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Compression distance + + Raises + ------ + ValueError + Install the paq module in order to use PAQ9A + + Examples + -------- + >>> cmp = NCDpaq9a() + >>> cmp.dist('cat', 'hat') + 0.42857142857142855 + >>> cmp.dist('Niall', 'Neil') + 0.5555555555555556 + >>> cmp.dist('aluminum', 'Catalan') + 0.5833333333333334 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + src = src.encode('utf-8') + tar = tar.encode('utf-8') + + if paq is not None: + src_comp = paq.compress(src) + tar_comp = paq.compress(tar) + concat_comp = paq.compress(src + tar) + concat_comp2 = paq.compress(tar + src) + else: # pragma: no cover + raise ValueError('Install the paq module in order to use PAQ9A') + + # Each string returned by PAQ9A's compressor has 4 header bytes + # followed by a byte of information then 3 null bytes. And it is + # concluded with 3 bytes of \xff. So 4+3+3 invariant bytes are + # subtracted here. + return ( + (min(len(concat_comp), len(concat_comp2)) - 10) + - (min(len(src_comp), len(tar_comp)) - 10) + ) / (max(len(src_comp), len(tar_comp)) - 10) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_ncd_rle.py b/abydos/distance/_ncd_rle.py index f98c4c7d1..d7ae658c0 100644 --- a/abydos/distance/_ncd_rle.py +++ b/abydos/distance/_ncd_rle.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._distance import _Distance +from .. import __version__ from ..compression import RLE __all__ = ['NCDrle', 'dist_ncd_rle', 'sim_ncd_rle'] @@ -40,6 +43,8 @@ class NCDrle(_Distance): Cf. https://en.wikipedia.org/wiki/Run-length_encoding Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. + + .. versionadded:: 0.3.6 """ _rle = RLE() @@ -71,6 +76,11 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 @@ -86,6 +96,12 @@ def dist(self, src, tar): ) / max(len(src_comp), len(tar_comp)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDrle.dist method instead.', +) def dist_ncd_rle(src, tar): """Return the NCD between two strings using RLE. @@ -114,10 +130,18 @@ def dist_ncd_rle(src, tar): >>> dist_ncd_rle('ATCG', 'TAGC') 1.0 + .. 
versionadded:: 0.3.5 + """ return NCDrle().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDrle.sim method instead.', +) def sim_ncd_rle(src, tar): """Return the NCD similarity between two strings using RLE. @@ -146,6 +170,8 @@ def sim_ncd_rle(src, tar): >>> sim_ncd_rle('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.3.5 + """ return NCDrle().sim(src, tar) diff --git a/abydos/distance/_ncd_zlib.py b/abydos/distance/_ncd_zlib.py index f91c25f29..a81b36d92 100644 --- a/abydos/distance/_ncd_zlib.py +++ b/abydos/distance/_ncd_zlib.py @@ -30,8 +30,10 @@ import zlib -from ._distance import _Distance +from deprecation import deprecated +from ._distance import _Distance +from .. import __version__ __all__ = ['NCDzlib', 'dist_ncd_zlib', 'sim_ncd_zlib'] @@ -42,9 +44,9 @@ class NCDzlib(_Distance): Cf. https://zlib.net/ Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. - """ - _compressor = None + .. versionadded:: 0.3.6 + """ def __init__(self, level=zlib.Z_DEFAULT_COMPRESSION): """Initialize zlib compressor. @@ -54,8 +56,11 @@ def __init__(self, level=zlib.Z_DEFAULT_COMPRESSION): level : int The compression level (0 to 9) + + .. versionadded:: 0.3.6 + """ - self._compressor = zlib.compressobj(level) + self._level = level def dist(self, src, tar): """Return the NCD between two strings using zlib compression. @@ -84,6 +89,11 @@ def dist(self, src, tar): >>> cmp.dist('ATCG', 'TAGC') 0.4 + + .. versionadded:: 0.3.5 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 @@ -91,21 +101,23 @@ def dist(self, src, tar): src = src.encode('utf-8') tar = tar.encode('utf-8') - self._compressor.compress(src) - src_comp = self._compressor.flush(zlib.Z_FULL_FLUSH) - self._compressor.compress(tar) - tar_comp = self._compressor.flush(zlib.Z_FULL_FLUSH) - self._compressor.compress(src + tar) - concat_comp = self._compressor.flush(zlib.Z_FULL_FLUSH) - self._compressor.compress(tar + src) - concat_comp2 = self._compressor.flush(zlib.Z_FULL_FLUSH) + src_comp = zlib.compress(src, self._level) + tar_comp = zlib.compress(tar, self._level) + concat_comp = zlib.compress(src + tar, self._level) + concat_comp2 = zlib.compress(tar + src, self._level) return ( min(len(concat_comp), len(concat_comp2)) - - min(len(src_comp), len(tar_comp)) - ) / max(len(src_comp), len(tar_comp)) + - (min(len(src_comp), len(tar_comp))) + ) / (max(len(src_comp), len(tar_comp)) - 2) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDzlib.dist method instead.', +) def dist_ncd_zlib(src, tar): """Return the NCD between two strings using zlib compression. @@ -134,10 +146,18 @@ def dist_ncd_zlib(src, tar): >>> dist_ncd_zlib('ATCG', 'TAGC') 0.4 + .. versionadded:: 0.3.5 + """ return NCDzlib().dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NCDzlib.sim method instead.', +) def sim_ncd_zlib(src, tar): """Return the NCD similarity between two strings using zlib compression. @@ -165,6 +185,8 @@ def sim_ncd_zlib(src, tar): >>> sim_ncd_zlib('ATCG', 'TAGC') 0.6 + .. 
versionadded:: 0.3.5 + """ return NCDzlib().sim(src, tar) diff --git a/abydos/distance/_needleman_wunsch.py b/abydos/distance/_needleman_wunsch.py index 79b2f1701..bf2655bd5 100644 --- a/abydos/distance/_needleman_wunsch.py +++ b/abydos/distance/_needleman_wunsch.py @@ -28,6 +28,8 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import float32 as np_float32 from numpy import zeros as np_zeros @@ -35,6 +37,7 @@ from ._distance import _Distance from ._ident import sim_ident +from .. import __version__ __all__ = ['NeedlemanWunsch', 'needleman_wunsch'] @@ -44,6 +47,9 @@ class NeedlemanWunsch(_Distance): The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit distance measure. + + + .. versionadded:: 0.3.6 """ @staticmethod @@ -106,6 +112,11 @@ def sim_matrix( >>> NeedlemanWunsch.sim_matrix('hat', 'hat') 1 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if alphabet: alphabet = tuple(alphabet) @@ -126,20 +137,38 @@ def sim_matrix( return mat[(tar, src)] return mismatch_cost - def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident): - """Return the Needleman-Wunsch score of two strings. + def __init__(self, gap_cost=1, sim_func=None, **kwargs): + """Initialize NeedlemanWunsch instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison gap_cost : float The cost of an alignment gap (1 by default) sim_func : function A function that returns the similarity of two characters (identity similarity by default) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(NeedlemanWunsch, self).__init__(**kwargs) + self._gap_cost = gap_cost + self._sim_func = sim_func + if self._sim_func is None: + self._sim_func = NeedlemanWunsch.sim_matrix + + def dist_abs(self, src, tar): + """Return the Needleman-Wunsch score of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -158,22 +187,35 @@ def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident): >>> cmp.dist_abs('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) for i in range(len(src) + 1): - d_mat[i, 0] = -(i * gap_cost) + d_mat[i, 0] = -(i * self._gap_cost) for j in range(len(tar) + 1): - d_mat[0, j] = -(j * gap_cost) + d_mat[0, j] = -(j * self._gap_cost) for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): - match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1]) - delete = d_mat[i - 1, j] - gap_cost - insert = d_mat[i, j - 1] - gap_cost + match = d_mat[i - 1, j - 1] + self._sim_func( + src[i - 1], tar[j - 1] + ) + delete = d_mat[i - 1, j] - self._gap_cost + insert = d_mat[i, j - 1] - self._gap_cost d_mat[i, j] = max(match, delete, insert) return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NeedlemanWunsch.dist_abs method instead.', +) def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident): """Return the Needleman-Wunsch score of two strings. @@ -207,8 +249,11 @@ def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident): >>> needleman_wunsch('ATCG', 'TAGC') 0.0 + + .. 
versionadded:: 0.1.0 + """ - return NeedlemanWunsch().dist_abs(src, tar, gap_cost, sim_func) + return NeedlemanWunsch(gap_cost, sim_func).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_overlap.py b/abydos/distance/_overlap.py index 4edbb3375..27779c237 100644 --- a/abydos/distance/_overlap.py +++ b/abydos/distance/_overlap.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._token_distance import _TokenDistance +from .. import __version__ __all__ = ['Overlap', 'dist_overlap', 'sim_overlap'] @@ -38,11 +41,59 @@ class Overlap(_TokenDistance): For two sets X and Y, the overlap coefficient :cite:`Szymkiewicz:1934,Simpson:1949`, also called the - Szymkiewicz-Simpson coefficient, is - :math:`sim_{overlap}(X, Y) = \frac{|X \cap Y|}{min(|X|, |Y|)}`. + Szymkiewicz-Simpson coefficient and Simpson's ecological coexistence + coefficient, is + + .. math:: + + sim_{overlap}(X, Y) = \frac{|X \cap Y|}{min(|X|, |Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{overlap} = \frac{a}{min(a+b, a+c)} + + .. versionadded:: 0.3.6 """ - def sim(self, src, tar, qval=2): + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize Overlap instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Overlap, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): r"""Return the overlap coefficient of two strings. Parameters @@ -51,8 +102,6 @@ def sim(self, src, tar, qval=2): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version Returns ------- @@ -71,20 +120,31 @@ def sim(self, src, tar, qval=2): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 - elif not src or not tar: - return 0.0 - q_src, q_tar = self._get_qgrams(src, tar, qval) - q_src_mag = sum(q_src.values()) - q_tar_mag = sum(q_tar.values()) - q_intersection_mag = sum((q_src & q_tar).values()) + self._tokenize(src, tar) - return q_intersection_mag / min(q_src_mag, q_tar_mag) + if not self._src_card() or not self._tar_card(): + return 0.0 + + return self._intersection_card() / min( + self._src_card(), self._tar_card() + ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Overlap.sim method instead.', +) def sim_overlap(src, tar, qval=2): r"""Return the overlap coefficient of two strings. @@ -97,7 +157,7 @@ def sim_overlap(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -115,10 +175,18 @@ def sim_overlap(src, tar, qval=2): >>> sim_overlap('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ - return Overlap().sim(src, tar, qval) + return Overlap(qval=qval).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Overlap.dist method instead.', +) def dist_overlap(src, tar, qval=2): """Return the overlap distance between two strings. @@ -131,7 +199,7 @@ def dist_overlap(src, tar, qval=2): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram Returns ------- @@ -149,8 +217,10 @@ def dist_overlap(src, tar, qval=2): >>> dist_overlap('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return Overlap().dist(src, tar, qval) + return Overlap(qval=qval).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_ozbay.py b/abydos/distance/_ozbay.py new file mode 100644 index 000000000..5a680ab40 --- /dev/null +++ b/abydos/distance/_ozbay.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._ozbay. + +Ozbay metric +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from . import Jaccard, LCSstr, Levenshtein +from ._distance import _Distance +from ..tokenizer import QGrams + +__all__ = ['Ozbay'] + + +class Ozbay(_Distance): + """Ozbay metric. + + The Ozbay metric :cite:`Ozbay:2015` is a string distance measure developed + by Hakan Ozbay, which combines Jaccard distance, Levenshtein distance, and + longest common substring distance. + + The normalized variant should be considered experimental. + + .. 
versionadded:: 0.4.0 + + """ + + _lev = Levenshtein() + _jac = Jaccard(tokenizer=QGrams(qval=1, start_stop='', scaler='set')) + _lcs = LCSstr() + + def dist_abs(self, src, tar): + """Return the Ozbay metric. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Ozbay metric + + Examples + -------- + >>> cmp = Ozbay() + >>> round(cmp.dist_abs('cat', 'hat'), 12) + 0.75 + >>> round(cmp.dist_abs('Niall', 'Neil'), 12) + 6.0 + >>> round(cmp.dist_abs('Colin', 'Cuilen'), 12) + 7.714285714286 + >>> cmp.dist_abs('ATCG', 'TAGC') + 3.0 + + + .. versionadded:: 0.4.0 + + """ + lev_dist = self._lev.dist_abs(src, tar) + lev_metric = 0.0 if lev_dist == 0 else lev_dist / len(src) + jac_metric = self._jac.dist_abs(src, tar) + lcs_metric = self._lcs.sim(src, tar) + + if jac_metric == 1.0: + ozbay_metric = lev_dist + elif jac_metric == 0.0: + ozbay_metric = lev_metric + else: + ozbay_metric = jac_metric * lev_dist + + if lcs_metric > 0.0: + ozbay_metric /= lcs_metric + else: + ozbay_metric *= min(len(src), len(tar)) + + return ozbay_metric + + def dist(self, src, tar): + """Return the normalized Ozbay distance. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Normalized Ozbay distance + + Examples + -------- + >>> cmp = Ozbay() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.027777777778 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 0.24 + >>> round(cmp.dist('Colin', 'Cuilen'), 12) + 0.214285714286 + >>> cmp.dist('ATCG', 'TAGC') + 0.140625 + + + .. versionadded:: 0.4.0 + + """ + dist = self.dist_abs(src, tar) + if dist: + return dist / (len(src) * len(tar) / self._lcs.dist(src, tar)) + return dist + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pattern.py b/abydos/distance/_pattern.py new file mode 100644 index 000000000..0fcbfa059 --- /dev/null +++ b/abydos/distance/_pattern.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pattern. + +Pattern difference +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Pattern'] + + +class Pattern(_TokenDistance): + r"""Pattern difference. + + For two sets X and Y and a population N, the pattern difference + :cite:`Batagelj:1995`, Batagelj & Bren's :math:`- bc -` is + + .. math:: + + dist_{pattern}(X, Y) = + \frac{4 \cdot |X \setminus Y| \cdot |Y \setminus X|} + {|N|^2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{pattern} = + \frac{4bc}{n^2} + + In :cite:`IBM:2017`, the formula omits the 4 in the numerator: + :math:`\frac{bc}{n^2}`. + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Pattern instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Pattern, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Pattern difference of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pattern difference + + Examples + -------- + >>> cmp = Pattern() + >>> cmp.dist('cat', 'hat') + 2.6030820491461892e-05 + >>> cmp.dist('Niall', 'Neil') + 7.809246147438568e-05 + >>> cmp.dist('aluminum', 'Catalan') + 0.0003635035904093472 + >>> cmp.dist('ATCG', 'TAGC') + 0.0001626926280716368 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + num = b * c + if num: + return 4 * b * c / n ** 2 + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pearson_chi_squared.py b/abydos/distance/_pearson_chi_squared.py new file mode 100644 index 000000000..13d819e4b --- /dev/null +++ b/abydos/distance/_pearson_chi_squared.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pearson_chi_squared. + +Pearson's Chi-Squared similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import copysign + +from ._token_distance import _TokenDistance + +__all__ = ['PearsonChiSquared'] + + +class PearsonChiSquared(_TokenDistance): + r"""Pearson's Chi-Squared similarity. 
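# Minimal sketch (not from the diff) of the pattern difference in 2x2 terms,
# dist = 4*b*c / n**2.  For the 'cat'/'hat' doctest above, bigrams with
# start/stop symbols give a = b = c = 2, and the doctest value implies a
# population of n = 784, hence d = 778; these counts are inferred, not stated.
def pattern_difference(a, b, c, d):
    n = a + b + c + d
    return 4 * b * c / n ** 2 if b * c else 0.0

pattern_difference(2, 2, 2, 778)  # ~2.603e-05, the cmp.dist('cat', 'hat') value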
+ + For two sets X and Y and a population N, the Pearson's :math:`\chi^2` + similarity :cite:`Pearson:1913` is + + .. math:: + + sim_{PearsonChiSquared}(X, Y) = + \frac{|N| \cdot (|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)^2} + {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|} + + This is also Pearson I similarity. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{PearsonChiSquared} = + \frac{n(ad-bc)^2}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize PearsonChiSquared instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(PearsonChiSquared, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return Pearson's Chi-Squared similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson's Chi-Squared similarity + + Examples + -------- + >>> cmp = PearsonChiSquared() + >>> cmp.sim_score('cat', 'hat') + 193.99489809335964 + >>> cmp.sim_score('Niall', 'Neil') + 101.99771068526542 + >>> cmp.sim_score('aluminum', 'Catalan') + 9.19249664336649 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.032298410951138765 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + ab = self._src_card() + ac = self._tar_card() + + if src == tar: + return float(n) + if not src or not tar: + return 0.0 + num = n * (a * d - b * c) ** 2 + if num: + return num / (ab * ac * (b + d) * (c + d)) + return 0.0 + + def corr(self, src, tar): + """Return Pearson's Chi-Squared correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson's Chi-Squared correlation + + Examples + -------- + >>> cmp = PearsonChiSquared() + >>> cmp.corr('cat', 'hat') + 0.2474424720578567 + >>> cmp.corr('Niall', 'Neil') + 0.1300991207720222 + >>> cmp.corr('aluminum', 'Catalan') + 0.011710186806836291 + >>> cmp.corr('ATCG', 'TAGC') + -4.1196952743799446e-05 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + score = self.sim_score(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + score /= a + b + c + d + + return copysign(score, a * d - b * c) + + def sim(self, src, tar): + """Return Pearson's normalized Chi-Squared similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Pearson's Chi-Squared similarity + + Examples + -------- + >>> cmp = PearsonChiSquared() + >>> cmp.corr('cat', 'hat') + 0.2474424720578567 + >>> cmp.corr('Niall', 'Neil') + 0.1300991207720222 + >>> cmp.corr('aluminum', 'Catalan') + 0.011710186806836291 + >>> cmp.corr('ATCG', 'TAGC') + -4.1196952743799446e-05 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pearson_heron_ii.py b/abydos/distance/_pearson_heron_ii.py new file mode 100644 index 000000000..179ce801b --- /dev/null +++ b/abydos/distance/_pearson_heron_ii.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pearson_heron_ii. + +Pearson & Heron II correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import cos, pi + +from ._token_distance import _TokenDistance + +__all__ = ['PearsonHeronII'] + + +class PearsonHeronII(_TokenDistance): + r"""Pearson & Heron II correlation. + + For two sets X and Y and a population N, Pearson & Heron II correlation + :cite:`Pearson:1913` is + + .. math:: + + corr_{PearsonHeronII}(X, Y) = + \cos \Big(\frac{\pi\sqrt{|X \setminus Y| \cdot |Y \setminus X|}} + {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + \sqrt{|X \setminus Y| \cdot |Y \setminus X|}}\Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{PearsonHeronII} = + \cos \Big(\frac{\pi\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}\Big) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize PearsonHeronII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
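# Sketch (not from the diff) of the chi-squared score in 2x2 terms,
# n*(a*d - b*c)**2 / ((a+b)*(a+c)*(b+d)*(c+d)), using the same inferred
# 'cat'/'hat' counts as above (a = b = c = 2, d = 778).
def pearson_chi_squared(a, b, c, d):
    n = a + b + c + d
    num = n * (a * d - b * c) ** 2
    return num / ((a + b) * (a + c) * (b + d) * (c + d)) if num else 0.0

pearson_chi_squared(2, 2, 2, 778)  # ~193.995, the cmp.sim_score('cat', 'hat') value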
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(PearsonHeronII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Pearson & Heron II correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson & Heron II correlation + + Examples + -------- + >>> cmp = PearsonHeronII() + >>> cmp.corr('cat', 'hat') + 0.9885309061036239 + >>> cmp.corr('Niall', 'Neil') + 0.9678978997263907 + >>> cmp.corr('aluminum', 'Catalan') + 0.7853000893691571 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return -1.0 + + self._tokenize(src, tar) + + root_ad = ( + self._intersection_card() * self._total_complement_card() + ) ** 0.5 + root_bc = (self._src_only_card() * self._tar_only_card()) ** 0.5 + + num = pi * root_bc + return cos((num / (root_ad + root_bc)) if num else 0.0) + + def sim(self, src, tar): + """Return the Pearson & Heron II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson & Heron II similarity + + Examples + -------- + >>> cmp = PearsonHeronII() + >>> cmp.sim('cat', 'hat') + 0.994265453051812 + >>> cmp.sim('Niall', 'Neil') + 0.9839489498631954 + >>> cmp.sim('aluminum', 'Catalan') + 0.8926500446845785 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pearson_ii.py b/abydos/distance/_pearson_ii.py new file mode 100644 index 000000000..f16b44f7d --- /dev/null +++ b/abydos/distance/_pearson_ii.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pearson_ii. + +Pearson II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._pearson_chi_squared import PearsonChiSquared + +__all__ = ['PearsonII'] + + +class PearsonII(PearsonChiSquared): + r"""Pearson II similarity. 
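from math import cos, pi

# Sketch (not from the diff) of Pearson & Heron II in 2x2 terms,
# cos(pi*sqrt(b*c) / (sqrt(a*d) + sqrt(b*c))), with the same inferred
# 'cat'/'hat' counts (a = b = c = 2, d = 778).
def pearson_heron_ii(a, b, c, d):
    root_ad = (a * d) ** 0.5
    root_bc = (b * c) ** 0.5
    return cos(pi * root_bc / (root_ad + root_bc)) if root_bc else 1.0

pearson_heron_ii(2, 2, 2, 778)  # ~0.9885, the cmp.corr('cat', 'hat') value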
+ + For two sets X and Y and a population N, the Pearson II + similarity :cite:`Pearson:1913`, Pearson's coefficient of mean square + contingency, is + + .. math:: + + corr_{PearsonII} = \sqrt{\frac{\chi^2}{|N|+\chi^2}} + + where + + .. math:: + + \chi^2 = sim_{PearsonChiSquared}(X, Y) = + \frac{|N| \cdot (|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)^2} + {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + \chi^2 = sim_{PearsonChiSquared} = + \frac{n \cdot (ad-bc)^2}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize PearsonII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(PearsonII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Pearson II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson II similarity + + Examples + -------- + >>> cmp = PearsonII() + >>> cmp.sim_score('cat', 'hat') + 0.44537605041688455 + >>> cmp.sim_score('Niall', 'Neil') + 0.3392961347892176 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.10758552665334761 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.006418353030552324 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 2 ** 0.5 / 2 + chi2 = super(PearsonII, self).sim_score(src, tar) + return (chi2 / (self._population_unique_card() + chi2)) ** 0.5 + + def sim(self, src, tar): + """Return the normalized Pearson II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Pearson II similarity + + Examples + -------- + >>> cmp = PearsonII() + >>> cmp.sim('cat', 'hat') + 0.6298568508557214 + >>> cmp.sim('Niall', 'Neil') + 0.47983719547968123 + >>> cmp.sim('aluminum', 'Catalan') + 0.15214891090821628 + >>> cmp.sim('ATCG', 'TAGC') + 0.009076921903905551 + + + .. 
versionadded:: 0.4.0 + + """ + return self.sim_score(src, tar) * 2 / 2 ** 0.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pearson_iii.py b/abydos/distance/_pearson_iii.py new file mode 100644 index 000000000..dad9b35f8 --- /dev/null +++ b/abydos/distance/_pearson_iii.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pearson_iii. + +Pearson III correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import copysign + +from ._pearson_phi import PearsonPhi + +__all__ = ['PearsonIII'] + + +class PearsonIII(PearsonPhi): + r"""Pearson III correlation. + + For two sets X and Y and a population N, the Pearson III + correlation :cite:`Pearson:1913`, Pearson's coefficient of racial likeness, + is + + .. math:: + + corr_{PearsonIII} = \sqrt{\frac{\phi}{|N|+\phi}} + + where + + .. math:: + + \phi = corr_{PearsonPhi}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {\sqrt{|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + \phi = corr_{PearsonPhi} = + \frac{ad-bc} + {\sqrt{(a+b)(a+c)(b+c)(b+d)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize PearsonIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(PearsonIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Pearson III correlation of two strings. 
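# Sketch (not from the diff) of how Pearson II rescales the chi-squared score:
# sim_score = sqrt(chi2 / (n + chi2)), and sim multiplies that by sqrt(2).
# chi2 ~193.995 and n = 784 are the inferred 'cat'/'hat' values from above.
chi2, n = 193.99489809335964, 784
score = (chi2 / (n + chi2)) ** 0.5  # ~0.4454, cmp.sim_score('cat', 'hat')
score * 2 / 2 ** 0.5                # ~0.6299, cmp.sim('cat', 'hat')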
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson III correlation + + Examples + -------- + >>> cmp = PearsonIII() + >>> cmp.corr('cat', 'hat') + 0.025180989806958435 + >>> cmp.corr('Niall', 'Neil') + 0.021444241017487504 + >>> cmp.corr('aluminum', 'Catalan') + 0.011740218922356615 + >>> cmp.corr('ATCG', 'TAGC') + -0.0028612777635371113 + + + .. versionadded:: 0.4.0 + + """ + phi = super(PearsonIII, self).corr(src, tar) + return copysign( + (abs(phi) / (self._population_unique_card() + phi)) ** 0.5, phi + ) + + def sim(self, src, tar): + """Return the Pearson III similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson III similarity + + Examples + -------- + >>> cmp = PearsonIII() + >>> cmp.sim('cat', 'hat') + 0.5125904949034792 + >>> cmp.sim('Niall', 'Neil') + 0.5107221205087438 + >>> cmp.sim('aluminum', 'Catalan') + 0.5058701094611783 + >>> cmp.sim('ATCG', 'TAGC') + 0.49856936111823147 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_pearson_phi.py b/abydos/distance/_pearson_phi.py new file mode 100644 index 000000000..b65cb1862 --- /dev/null +++ b/abydos/distance/_pearson_phi.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._pearson_phi. + +Pearson's Phi correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['PearsonPhi'] + + +class PearsonPhi(_TokenDistance): + r"""Pearson's Phi correlation. + + For two sets X and Y and a population N, the Pearson's :math:`\phi` + correlation :cite:`Pearson:1900,Pearson:1913,Guilford:1956` is + + .. math:: + + corr_{PearsonPhi}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {\sqrt{|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}} + + This is also Pearson & Heron I similarity. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{PearsonPhi} = + \frac{ad-bc} + {\sqrt{(a+b)(a+c)(b+d)(c+d)}} + + Notes + ----- + In terms of a confusion matrix, this is equivalent to the Matthews + correlation coefficient :py:meth:`ConfusionTable.mcc`. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize PearsonPhi instance. 
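from math import copysign

# Sketch (not from the diff) of Pearson III, which rescales the phi
# correlation: corr = copysign(sqrt(|phi| / (n + phi)), phi).  phi ~0.4974 is
# the PearsonPhi doctest value for 'cat'/'hat' (next file in this diff) and
# n = 784 is the inferred default population.
phi, n = 0.49743589743589745, 784
copysign((abs(phi) / (n + phi)) ** 0.5, phi)  # ~0.02518, cmp.corr('cat', 'hat')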
+ + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(PearsonPhi, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return Pearson's Phi correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson's Phi correlation + + Examples + -------- + >>> cmp = PearsonPhi() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.36069255713421955 + >>> cmp.corr('aluminum', 'Catalan') + 0.10821361655002706 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + if src == tar: + return 1.0 + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + ab = self._src_card() + ac = self._tar_card() + + num = a * d - b * c + if num: + return num / (ab * ac * (b + d) * (c + d)) ** 0.5 + return 0.0 + + def sim(self, src, tar): + """Return the normalized Pearson's Phi similarity of two strings. + + This is normalized to [0, 1] by adding 1 and dividing by 2. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Pearson's Phi similarity + + Examples + -------- + >>> cmp = PearsonPhi() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6803462785671097 + >>> cmp.sim('aluminum', 'Catalan') + 0.5541068082750136 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (self.corr(src, tar) + 1.0) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_peirce.py b/abydos/distance/_peirce.py new file mode 100644 index 000000000..bdb39ca41 --- /dev/null +++ b/abydos/distance/_peirce.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
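# Sketch (not from the diff) of the phi correlation in 2x2 terms,
# (a*d - b*c) / sqrt((a+b)*(a+c)*(b+d)*(c+d)), again with the inferred
# 'cat'/'hat' counts (a = b = c = 2, d = 778).
def pearson_phi(a, b, c, d):
    num = a * d - b * c
    return num / ((a + b) * (a + c) * (b + d) * (c + d)) ** 0.5 if num else 0.0

pearson_phi(2, 2, 2, 778)  # ~0.4974, the cmp.corr('cat', 'hat') value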
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._peirce. + +Peirce correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Peirce'] + + +class Peirce(_TokenDistance): + r"""Peirce correlation. + + For two sets X and Y and a population N, the Peirce correlation + :cite:`Peirce:1884` is + + .. math:: + + corr_{Peirce}(X, Y) = \frac{|X \cap Y| \cdot + |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus Y|} + {|X| \cdot |N \setminus X|} + + Both :cite:`Choi:2010` and :cite:`Hubalek:1982` present a different formula + and incorrectly attribute it to Peirce. Likewise, :cite:`Doolittle:1884` + presents a different formula and incorrectly attributes it to Peirce. This + is distinct from the formula he presents and attributes to himself. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Peirce} = + \frac{ad-bc}{(a+b)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Peirce instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Peirce, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Peirce correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Peirce correlation + + Examples + -------- + >>> cmp = Peirce() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.32947729220222793 + >>> cmp.corr('aluminum', 'Catalan') + 0.10209049255441008 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. 
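# Sketch (not from the diff) of the Peirce correlation in 2x2 terms,
# (a*d - b*c) / ((a+b)*(c+d)): the denominator uses only |X| and |N \ X|,
# unlike the phi-style denominators above.  Same inferred 'cat'/'hat' counts.
def peirce(a, b, c, d):
    num = a * d - b * c
    return num / ((a + b) * (c + d)) if num else 0.0

peirce(2, 2, 2, 778)  # ~0.4974, the cmp.corr('cat', 'hat') value shown below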
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + if num: + return num / ((a + b) * (c + d)) + return 0.0 + + def sim(self, src, tar): + """Return the Peirce similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Peirce similarity + + Examples + -------- + >>> cmp = Peirce() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.664738646101114 + >>> cmp.sim('aluminum', 'Catalan') + 0.5510452462772051 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_positional_q_gram_dice.py b/abydos/distance/_positional_q_gram_dice.py new file mode 100644 index 000000000..6d6cdc6f1 --- /dev/null +++ b/abydos/distance/_positional_q_gram_dice.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._positional_q_gram_dice. + +Positional Q-Gram Dice coefficient +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import defaultdict + +from ._distance import _Distance +from ..tokenizer import QGrams, WhitespaceTokenizer + +__all__ = ['PositionalQGramDice'] + + +class PositionalQGramDice(_Distance): + r"""Positional Q-Gram Dice coefficient. + + Positional Q-Gram Dice coefficient :cite:`Gravano:2001,Christen:2006` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, max_dist=1, tokenizer=None, **kwargs): + """Initialize PositionalQGramDice instance. + + Parameters + ---------- + max_dist : int + The maximum positional distance between to q-grams to count as a + match. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(PositionalQGramDice, self).__init__( + tokenizer=tokenizer, **kwargs + ) + self._max_dist = max_dist + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + def sim(self, src, tar): + """Return the Positional Q-Gram Dice coefficient of two strings. 
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Positional Q-Gram Dice coefficient + + Examples + -------- + >>> cmp = PositionalQGramDice() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + src_list = self.params['tokenizer'].tokenize(src).get_list() + tar_list = self.params['tokenizer'].tokenize(tar).get_list() + + src_pos = defaultdict(list) + tar_pos = defaultdict(list) + + intersection = 0 + + for pos in range(len(src_list)): + src_pos[src_list[pos]].append(pos) + for pos in range(len(tar_list)): + tar_pos[tar_list[pos]].append(pos) + + src_matched = [] + tar_matched = [] + + for tok in src_pos: + if tok in tar_pos: + for sp in src_pos[tok]: + for tp in tar_pos[tok]: + if ( + abs(sp - tp) <= self._max_dist + and sp not in src_matched + and tp not in tar_matched + ): + intersection += 1 + src_matched.append(sp) + tar_matched.append(tp) + + denom = len(src_list) + len(tar_list) + + return 2 * intersection / denom + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_positional_q_gram_jaccard.py b/abydos/distance/_positional_q_gram_jaccard.py new file mode 100644 index 000000000..9f4f62f3c --- /dev/null +++ b/abydos/distance/_positional_q_gram_jaccard.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._positional_q_gram_jaccard. + +Positional Q-Gram Jaccard coefficient +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import defaultdict + +from ._distance import _Distance +from ..tokenizer import QGrams, WhitespaceTokenizer + +__all__ = ['PositionalQGramJaccard'] + + +class PositionalQGramJaccard(_Distance): + r"""Positional Q-Gram Jaccard coefficient. + + Positional Q-Gram Jaccard coefficient :cite:`Gravano:2001,Christen:2006` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, max_dist=1, tokenizer=None, **kwargs): + """Initialize PositionalQGramJaccard instance. + + Parameters + ---------- + max_dist : int + The maximum positional distance between to q-grams to count as a + match. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. 
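from collections import defaultdict

# Stand-alone sketch (not from the diff) of the positional matching shared by
# the PositionalQGram* classes: two q-grams pair up if they are equal, lie
# within max_dist positions of each other, and neither position has been used.
# The greedy pairing order here may differ from the class's loop in edge cases.
def positional_matches(src_grams, tar_grams, max_dist=1):
    tar_pos = defaultdict(list)
    for pos, gram in enumerate(tar_grams):
        tar_pos[gram].append(pos)
    matched_tar, matches = set(), 0
    for pos, gram in enumerate(src_grams):
        for tp in tar_pos[gram]:
            if abs(pos - tp) <= max_dist and tp not in matched_tar:
                matched_tar.add(tp)
                matches += 1
                break
    return matches

# Bigrams of 'cat' and 'hat' with start/stop symbols share 'at' and 't#' at
# identical positions, so the Dice form gives 2*2/(4+4) = 0.5, as in the
# doctest above.
src, tar = ['$c', 'ca', 'at', 't#'], ['$h', 'ha', 'at', 't#']
2 * positional_matches(src, tar) / (len(src) + len(tar))  # 0.5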
versionadded:: 0.4.0 + + """ + super(PositionalQGramJaccard, self).__init__( + tokenizer=tokenizer, **kwargs + ) + self._max_dist = max_dist + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + def sim(self, src, tar): + """Return the Positional Q-Gram Jaccard coefficient of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Positional Q-Gram Jaccard coefficient + + Examples + -------- + >>> cmp = PositionalQGramJaccard() + >>> cmp.sim('cat', 'hat') + 0.3333333333333333 + >>> cmp.sim('Niall', 'Neil') + 0.2222222222222222 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + src_list = self.params['tokenizer'].tokenize(src).get_list() + tar_list = self.params['tokenizer'].tokenize(tar).get_list() + + src_pos = defaultdict(list) + tar_pos = defaultdict(list) + + intersection = 0 + + for pos in range(len(src_list)): + src_pos[src_list[pos]].append(pos) + for pos in range(len(tar_list)): + tar_pos[tar_list[pos]].append(pos) + + src_matched = [] + tar_matched = [] + + for tok in src_pos: + if tok in tar_pos: + for sp in src_pos[tok]: + for tp in tar_pos[tok]: + if ( + abs(sp - tp) <= self._max_dist + and sp not in src_matched + and tp not in tar_matched + ): + intersection += 1 + src_matched.append(sp) + tar_matched.append(tp) + + union = len(src_list) + len(tar_list) - len(src_matched) + + return intersection / union + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_positional_q_gram_overlap.py b/abydos/distance/_positional_q_gram_overlap.py new file mode 100644 index 000000000..21345ca60 --- /dev/null +++ b/abydos/distance/_positional_q_gram_overlap.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._positional_q_gram_overlap. + +Positional Q-Gram Overlap coefficient +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import defaultdict + +from ._distance import _Distance +from ..tokenizer import QGrams, WhitespaceTokenizer + +__all__ = ['PositionalQGramOverlap'] + + +class PositionalQGramOverlap(_Distance): + r"""Positional Q-Gram Overlap coefficient. + + Positional Q-Gram Overlap coefficient :cite:`Gravano:2001,Christen:2006` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, max_dist=1, tokenizer=None, **kwargs): + """Initialize PositionalQGramOverlap instance. + + Parameters + ---------- + max_dist : int + The maximum positional distance between to q-grams to count as a + match. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(PositionalQGramOverlap, self).__init__( + tokenizer=tokenizer, **kwargs + ) + self._max_dist = max_dist + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + def sim(self, src, tar): + """Return the Positional Q-Gram Overlap coefficient of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Positional Q-Gram Overlap coefficient + + Examples + -------- + >>> cmp = PositionalQGramOverlap() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.4 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + src_list = self.params['tokenizer'].tokenize(src).get_list() + tar_list = self.params['tokenizer'].tokenize(tar).get_list() + + src_pos = defaultdict(list) + tar_pos = defaultdict(list) + + intersection = 0 + + for pos in range(len(src_list)): + src_pos[src_list[pos]].append(pos) + for pos in range(len(tar_list)): + tar_pos[tar_list[pos]].append(pos) + + src_matched = [] + tar_matched = [] + + for tok in src_pos: + if tok in tar_pos: + for sp in src_pos[tok]: + for tp in tar_pos[tok]: + if ( + abs(sp - tp) <= self._max_dist + and sp not in src_matched + and tp not in tar_matched + ): + intersection += 1 + src_matched.append(sp) + tar_matched.append(tp) + + denom = min(len(src_list), len(tar_list)) + + return intersection / denom + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_prefix.py b/abydos/distance/_prefix.py index 841491fa4..280ce3823 100644 --- a/abydos/distance/_prefix.py +++ b/abydos/distance/_prefix.py @@ -28,15 +28,21 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Prefix', 'dist_prefix', 'sim_prefix'] class Prefix(_Distance): - """Prefix similiarity and distance.""" + """Prefix similiarity and distance. + + .. versionadded:: 0.3.6 + """ def sim(self, src, tar): """Return the prefix similarity of two strings. @@ -69,6 +75,11 @@ def sim(self, src, tar): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 @@ -82,6 +93,12 @@ def sim(self, src, tar): return 0.0 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Prefix.sim method instead.', +) def sim_prefix(src, tar): """Return the prefix similarity of two strings. @@ -110,10 +127,18 @@ def sim_prefix(src, tar): >>> sim_prefix('ATCG', 'TAGC') 0.0 + .. 
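# The three positional coefficients differ only in their denominators (sketch,
# not from the diff), shown with the 'cat'/'hat' figures used above: 2
# positional matches over 4 source and 4 target bigrams.
matches, src_len, tar_len = 2, 4, 4
2 * matches / (src_len + tar_len)        # Dice:    0.5
matches / (src_len + tar_len - matches)  # Jaccard: 0.333...
matches / min(src_len, tar_len)          # Overlap: 0.5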
versionadded:: 0.1.0 + """ return Prefix().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Prefix.dist method instead.', +) def dist_prefix(src, tar): """Return the prefix distance between two strings. @@ -142,6 +167,8 @@ def dist_prefix(src, tar): >>> dist_prefix('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ return Prefix().dist(src, tar) diff --git a/abydos/distance/_q_gram.py b/abydos/distance/_q_gram.py new file mode 100644 index 000000000..631b40445 --- /dev/null +++ b/abydos/distance/_q_gram.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._q_gram. + +q-gram distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance +from ..tokenizer import QGrams as QGramTokenizer + +__all__ = ['QGram'] + + +class QGram(_TokenDistance): + r"""q-gram distance. + + For two multisets X and Y, q-gram distance + :cite:`Ukkonen:1992` is + + .. math:: + + sim_{QGram}(X, Y) = |X \triangle Y| + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{QGram} = b+c + + Notes + ----- + This class uses bigrams without appended start or stop symbols, by default, + as in :cite:`Ukkonen:1992`'s examples. It is described as the :math:`L_1` + norm of the difference of two strings' q-gram profiles, which are the + vectors of q-gram occurrences. But this norm is simply the symmetric + difference of the two multisets. + + There aren't any limitations on which tokenizer is used with this class, + but, as the name would imply, q-grams are expected and the default. + + The normalized form uses the union of X and Y, making it equivalent to the + Jaccard distance :py:class:`.Jaccard`, but the Jaccard class, by default + uses bigrams with start & stop symbols. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize QGram instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
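from collections import Counter

# Sketch (not from the diff) of the q-gram distance as the symmetric difference
# of two bigram multisets, with no start/stop symbols, per the class's default.
def qgram_dist(src, tar, q=2):
    src_grams = Counter(src[i:i + q] for i in range(len(src) - q + 1))
    tar_grams = Counter(tar[i:i + q] for i in range(len(tar) - q + 1))
    return sum(((src_grams - tar_grams) + (tar_grams - src_grams)).values())

qgram_dist('cat', 'hat')     # 2, as in cmp.dist_abs('cat', 'hat')
qgram_dist('Niall', 'Neil')  # 7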
versionadded:: 0.4.0 + + """ + if tokenizer is None: + if 'qval' in kwargs: + qval = kwargs['qval'] + else: + qval = 2 + tokenizer = QGramTokenizer(qval=qval, start_stop='') + + super(QGram, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def dist_abs(self, src, tar): + """Return the q-gram distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + int + q-gram distance + + Examples + -------- + >>> cmp = QGram() + >>> cmp.dist_abs('cat', 'hat') + 2 + >>> cmp.dist_abs('Niall', 'Neil') + 7 + >>> cmp.dist_abs('aluminum', 'Catalan') + 11 + >>> cmp.dist_abs('ATCG', 'TAGC') + 6 + >>> cmp.dist_abs('01000', '001111') + 5 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + + return b + c + + def dist(self, src, tar): + """Return the normalized q-gram distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + q-gram distance + + Examples + -------- + >>> cmp = QGram() + >>> cmp.sim('cat', 'hat') + 0.33333333333333337 + >>> cmp.sim('Niall', 'Neil') + 0.0 + >>> cmp.sim('aluminum', 'Catalan') + 0.08333333333333337 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + num = self.dist_abs(src, tar) + if num: + return num / self._union_card() + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_quantitative_cosine.py b/abydos/distance/_quantitative_cosine.py new file mode 100644 index 000000000..fcd27ead4 --- /dev/null +++ b/abydos/distance/_quantitative_cosine.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._quantitative_cosine. + +Quantitative Cosine similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['QuantitativeCosine'] + + +class QuantitativeCosine(_TokenDistance): + r"""Quantitative Cosine similarity. + + For two multisets X and Y drawn from an alphabet S, Quantitative Cosine + similarity is + + .. math:: + + sim_{QuantitativeCosine}(X, Y) = + \frac{\sum_{i \in S} X_iY_i} + {\sqrt{\sum_{i \in S} X_i^2}\sqrt{\sum_{i \in S} Y_i^2}} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize QuantitativeCosine instance. 
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(QuantitativeCosine, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the Quantitative Cosine similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Quantitative Cosine similarity + + Examples + -------- + >>> cmp = QuantitativeCosine() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3651483716701107 + >>> cmp.sim('aluminum', 'Catalan') + 0.10660035817780521 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 0.0 + + alphabet = self._total().keys() + + return sum( + self._src_tokens[tok] * self._tar_tokens[tok] for tok in alphabet + ) / ( + sum( + self._src_tokens[tok] * self._src_tokens[tok] + for tok in alphabet + ) + ** 0.5 + * sum( + self._tar_tokens[tok] * self._tar_tokens[tok] + for tok in alphabet + ) + ** 0.5 + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_quantitative_dice.py b/abydos/distance/_quantitative_dice.py new file mode 100644 index 000000000..43e05b6e4 --- /dev/null +++ b/abydos/distance/_quantitative_dice.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._quantitative_dice. + +Quantitative Dice similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['QuantitativeDice'] + + +class QuantitativeDice(_TokenDistance): + r"""Quantitative Dice similarity. + + For two multisets X and Y drawn from an alphabet S, Quantitative Dice + similarity is + + .. math:: + + sim_{QuantitativeDice}(X, Y) = + \frac{2 \cdot \sum_{i \in S} X_iY_i} + {\sum_{i \in S} X_i^2 + \sum_{i \in S} Y_i^2} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize QuantitativeDice instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. 
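from collections import Counter

# Sketch (not from the diff) of the quantitative (multiset) cosine over bigram
# count vectors.  Start/stop-padded bigrams are assumed here; the 'Niall'/'Neil'
# doctest value above is consistent with that default.
def qcosine(x, y):
    keys = set(x) | set(y)
    dot = sum(x[k] * y[k] for k in keys)
    if not dot:
        return 0.0
    return dot / (
        sum(x[k] ** 2 for k in keys) ** 0.5
        * sum(y[k] ** 2 for k in keys) ** 0.5
    )

qcosine(Counter(['$c', 'ca', 'at', 't#']), Counter(['$h', 'ha', 'at', 't#']))  # 0.5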
+ + + .. versionadded:: 0.4.0 + + """ + super(QuantitativeDice, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the Quantitative Dice similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Quantitative Dice similarity + + Examples + -------- + >>> cmp = QuantitativeDice() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.10526315789473684 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + alphabet = self._total().keys() + + return ( + 2 + * sum( + self._src_tokens[tok] * self._tar_tokens[tok] + for tok in alphabet + ) + / ( + sum( + self._src_tokens[tok] * self._src_tokens[tok] + for tok in alphabet + ) + + sum( + self._tar_tokens[tok] * self._tar_tokens[tok] + for tok in alphabet + ) + ) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_quantitative_jaccard.py b/abydos/distance/_quantitative_jaccard.py new file mode 100644 index 000000000..6cfb84270 --- /dev/null +++ b/abydos/distance/_quantitative_jaccard.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._quantitative_jaccard. + +Quantitative Jaccard similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['QuantitativeJaccard'] + + +class QuantitativeJaccard(_TokenDistance): + r"""Quantitative Jaccard similarity. + + For two multisets X and Y drawn from an alphabet S, Quantitative Jaccard + similarity is + + .. math:: + + sim_{QuantitativeJaccard}(X, Y) = + \frac{\sum_{i \in S} X_iY_i} + {\sum_{i \in S} X_i^2 + \sum_{i \in S} Y_i^2 - + \sum_{i \in S} X_iY_i} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize QuantitativeJaccard instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(QuantitativeJaccard, self).__init__( + tokenizer=tokenizer, **kwargs + ) + + def sim(self, src, tar): + """Return the Quantitative Jaccard similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Quantitative Jaccard similarity + + Examples + -------- + >>> cmp = QuantitativeJaccard() + >>> cmp.sim('cat', 'hat') + 0.3333333333333333 + >>> cmp.sim('Niall', 'Neil') + 0.2222222222222222 + >>> cmp.sim('aluminum', 'Catalan') + 0.05555555555555555 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + alphabet = self._total().keys() + + product = sum( + self._src_tokens[tok] * self._tar_tokens[tok] for tok in alphabet + ) + + return product / ( + sum( + self._src_tokens[tok] * self._src_tokens[tok] + for tok in alphabet + ) + + sum( + self._tar_tokens[tok] * self._tar_tokens[tok] + for tok in alphabet + ) + - product + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_ratcliff_obershelp.py b/abydos/distance/_ratcliff_obershelp.py index 86d14bb8e..1c9ac4ebd 100644 --- a/abydos/distance/_ratcliff_obershelp.py +++ b/abydos/distance/_ratcliff_obershelp.py @@ -28,12 +28,15 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import int as np_int from numpy import zeros as np_zeros from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = [ 'RatcliffObershelp', @@ -58,6 +61,8 @@ class RatcliffObershelp(_Distance): Cf. http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970 + + .. versionadded:: 0.3.6 """ def sim(self, src, tar): @@ -87,6 +92,11 @@ def sim(self, src, tar): >>> cmp.sim('ATCG', 'TAGC') 0.5 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _lcsstr_stl(src, tar): @@ -106,6 +116,8 @@ def _lcsstr_stl(src, tar): target string, and length of the longest common substring of strings src and tar. + .. versionadded:: 0.1.0 + """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) longest, src_longest, tar_longest = 0, 0, 0 @@ -146,6 +158,8 @@ def _sstr_matches(src, tar): int Sum of substring match lengths + .. versionadded:: 0.1.0 + """ src_start, tar_start, length = _lcsstr_stl(src, tar) if length == 0: @@ -165,6 +179,12 @@ def _sstr_matches(src, tar): return 2 * _sstr_matches(src, tar) / (len(src) + len(tar)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RatcliffObershelp.sim method instead.', +) def sim_ratcliff_obershelp(src, tar): """Return the Ratcliff-Obershelp similarity of two strings. @@ -193,10 +213,18 @@ def sim_ratcliff_obershelp(src, tar): >>> sim_ratcliff_obershelp('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.1.0 + """ return RatcliffObershelp().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RatcliffObershelp.dist method instead.', +) def dist_ratcliff_obershelp(src, tar): """Return the Ratcliff-Obershelp distance between two strings. @@ -225,6 +253,8 @@ def dist_ratcliff_obershelp(src, tar): >>> dist_ratcliff_obershelp('ATCG', 'TAGC') 0.5 + .. 
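# Standalone sketch (not the Abydos implementation) of the three Quantitative*
# measures added above -- cosine, Dice, and Jaccard over token multisets --
# written directly from the formulas in their class docstrings.  X and Y would
# normally come from a tokenizer such as QGrams; any Counter works here.
from collections import Counter
from math import isclose, sqrt

def _dot(x, y):
    # sum over the union alphabet of X_i * Y_i
    return sum(x[tok] * y[tok] for tok in set(x) | set(y))

def quantitative_cosine(x, y):
    return _dot(x, y) / (sqrt(_dot(x, x)) * sqrt(_dot(y, y)))

def quantitative_dice(x, y):
    return 2 * _dot(x, y) / (_dot(x, x) + _dot(y, y))

def quantitative_jaccard(x, y):
    return _dot(x, y) / (_dot(x, x) + _dot(y, y) - _dot(x, y))

X, Y = Counter({'a': 2, 'b': 1}), Counter({'a': 1, 'b': 1})
assert isclose(quantitative_cosine(X, Y), 3 / sqrt(10))
assert isclose(quantitative_dice(X, Y), 6 / 7)
assert isclose(quantitative_jaccard(X, Y), 3 / 4)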
versionadded:: 0.1.0 + """ return RatcliffObershelp().dist(src, tar) diff --git a/abydos/distance/_rees_levenshtein.py b/abydos/distance/_rees_levenshtein.py new file mode 100644 index 000000000..faf7cc73f --- /dev/null +++ b/abydos/distance/_rees_levenshtein.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rees_levenshtein. + +Rees-Levenshtein distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import int as np_int +from numpy import zeros as np_zeros + +from ._distance import _Distance + +__all__ = ['ReesLevenshtein'] + + +class ReesLevenshtein(_Distance): + r"""Rees-Levenshtein distance. + + Rees-Levenshtein distance :cite:`Rees:2014,Rees:2013` is the "Modified + Damerau-Levenshtein Distance Algorithm, created by Tony Rees as part of + Taxamatch. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, block_limit=2, normalizer=max, **kwargs): + """Initialize ReesLevenshtein instance. + + Parameters + ---------- + block_limit : int + The block length limit + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(ReesLevenshtein, self).__init__(**kwargs) + self._normalizer = normalizer + self._block_limit = block_limit + + def dist_abs(self, src, tar): + """Return the Rees-Levenshtein distance of two strings. + + This is a straightforward port of the PL/SQL implementation at + https://confluence.csiro.au/public/taxamatch/the-mdld-modified-damerau-levenshtein-distance-algorithm + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Rees-Levenshtein distance + + Examples + -------- + >>> cmp = ReesLevenshtein() + >>> cmp.dist_abs('cat', 'hat') + 1 + >>> cmp.dist_abs('Niall', 'Neil') + 3 + >>> cmp.dist_abs('aluminum', 'Catalan') + 7 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2 + + + .. 
versionadded:: 0.4.0 + + """ + v_str1_length = len(src) + v_str2_length = len(tar) + + if tar == src: + return 0 + if not src: + return len(tar) + if not tar: + return len(src) + if v_str1_length == 1 and v_str2_length == 1: + return 1 + + def _substr(string, start, length): + if start > 0: + start -= 1 + else: + start += len(string) - 1 + + end = start + length + + return string[start:end] + + v_temp_str1 = str(src) + v_temp_str2 = str(tar) + + # first trim common leading characters + while v_temp_str1[:1] == v_temp_str2[:1]: + v_temp_str1 = v_temp_str1[1:] + v_temp_str2 = v_temp_str2[1:] + + # then trim common trailing characters + while v_temp_str1[-1:] == v_temp_str2[-1:]: + v_temp_str1 = v_temp_str1[:-1] + v_temp_str2 = v_temp_str2[:-1] + + v_str1_length = len(v_temp_str1) + v_str2_length = len(v_temp_str2) + + # then calculate standard Levenshtein Distance + if v_str1_length == 0 or v_str2_length == 0: + return max(v_str2_length, v_str1_length) + if v_str1_length == 1 and v_str2_length == 1: + return 1 + + # create table (NB: this is transposed relative to the PL/SQL version) + d_mat = np_zeros((v_str1_length + 1, v_str2_length + 1), dtype=np_int) + + # enter values in first (leftmost) column + for i in range(1, v_str1_length + 1): + d_mat[i, 0] = i + # populate remaining columns + for j in range(1, v_str2_length + 1): + d_mat[0, j] = j + + for i in range(1, v_str1_length + 1): + if v_temp_str1[i - 1] == v_temp_str2[j - 1]: + v_this_cost = 0 + else: + v_this_cost = 1 + + # extension to cover multiple single, double, triple, etc. + # character transpositions + # that includes calculation of original Levenshtein distance + # when no transposition found + v_temp_block_length = int( + min( + v_str1_length / 2, v_str2_length / 2, self._block_limit + ) + ) + + while v_temp_block_length >= 1: + if ( + (i >= v_temp_block_length * 2) + and (j >= v_temp_block_length * 2) + and ( + _substr( + v_temp_str1, + i - v_temp_block_length * 2 - 1, + v_temp_block_length, + ) + == _substr( + v_temp_str2, + j - v_temp_block_length - 1, + v_temp_block_length, + ) + ) + and ( + _substr( + v_temp_str1, + i - v_temp_block_length - 1, + v_temp_block_length, + ) + == _substr( + v_temp_str2, + j - v_temp_block_length * 2 - 1, + v_temp_block_length, + ) + ) + ): + # transposition found + d_mat[i, j] = min( + d_mat[i, j - 1] + 1, + d_mat[i - 1, j] + 1, + d_mat[ + i - v_temp_block_length * 2, + j - v_temp_block_length * 2, + ] + + v_this_cost + + v_temp_block_length + - 1, + ) + v_temp_block_length = 0 + elif v_temp_block_length == 1: + # no transposition + d_mat[i, j] = min( + d_mat[i, j - 1] + 1, + d_mat[i - 1, j] + 1, + d_mat[i - 1, j - 1] + v_this_cost, + ) + v_temp_block_length -= 1 + + return d_mat[v_str1_length, v_str2_length] + + def dist(self, src, tar): + """Return the normalized Rees-Levenshtein distance of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + Normalized Rees-Levenshtein distance + + Examples + -------- + >>> cmp = ReesLevenshtein() + >>> cmp.dist('cat', 'hat') + 0.3333333333333333 + >>> cmp.dist('Niall', 'Neil') + 0.6 + >>> cmp.dist('aluminum', 'Catalan') + 0.875 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + + + .. 
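# Minimal sketch (a simplification, not the PL/SQL port above) of the two steps
# dist_abs starts from: strip the common prefix and suffix, then apply the
# plain Levenshtein recurrence.  The block-transposition extension above only
# changes the result when a transposed block is actually found.
def _trim_common_ends(src, tar):
    while src and tar and src[0] == tar[0]:
        src, tar = src[1:], tar[1:]
    while src and tar and src[-1] == tar[-1]:
        src, tar = src[:-1], tar[:-1]
    return src, tar

def _plain_levenshtein(src, tar):
    prev = list(range(len(tar) + 1))
    for i, s_ch in enumerate(src, start=1):
        cur = [i]
        for j, t_ch in enumerate(tar, start=1):
            cur.append(min(cur[j - 1] + 1,                    # insertion
                           prev[j] + 1,                       # deletion
                           prev[j - 1] + (s_ch != t_ch)))     # substitution/match
        prev = cur
    return prev[-1]

src, tar = _trim_common_ends('Niall', 'Neil')
assert (src, tar) == ('ial', 'ei')
# same value the dist_abs doctest above reports for this pair
assert _plain_levenshtein(src, tar) == 3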
versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + return self.dist_abs(src, tar) / ( + self._normalizer([len(src), len(tar)]) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_roberts.py b/abydos/distance/_roberts.py new file mode 100644 index 000000000..4eaa3664a --- /dev/null +++ b/abydos/distance/_roberts.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._roberts. + +Roberts similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Roberts'] + + +class Roberts(_TokenDistance): + r"""Roberts similarity. + + For two multisets X and Y drawn from an alphabet S, Roberts similarity + :cite:`Roberts:1986` is + + .. math:: + + sim_{Roberts}(X, Y) = + \frac{\Big[\sum_{i \in S} (X_i + Y_i) \cdot + \frac{min(X_i, Y_i)}{max(X_i, Y_i)}\Big]} + {\sum_{i \in S} (X_i + Y_i)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Roberts instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Roberts, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the Roberts similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Roberts similarity + + Examples + -------- + >>> cmp = Roberts() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.11764705882352941 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + alphabet = self._total().keys() + + return sum( + (self._src_tokens[i] + self._tar_tokens[i]) + * min(self._src_tokens[i], self._tar_tokens[i]) + / max(self._src_tokens[i], self._tar_tokens[i]) + for i in alphabet + ) / sum((self._src_tokens[i] + self._tar_tokens[i]) for i in alphabet) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rogers_tanimoto.py b/abydos/distance/_rogers_tanimoto.py new file mode 100644 index 000000000..3087541c6 --- /dev/null +++ b/abydos/distance/_rogers_tanimoto.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. 
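# Standalone sketch of the Roberts formula above (not the Abydos class),
# computed straight from two token-count multisets.
from collections import Counter
from math import isclose

def roberts_sim(x, y):
    alphabet = set(x) | set(y)
    num = sum((x[t] + y[t]) * min(x[t], y[t]) / max(x[t], y[t])
              for t in alphabet)
    return num / sum(x[t] + y[t] for t in alphabet)

X, Y = Counter({'a': 2, 'b': 1}), Counter({'a': 1, 'b': 1, 'c': 1})
# per-token terms: 'a' -> 3 * 1/2, 'b' -> 2 * 1/1, 'c' -> 1 * 0/1; weight sum 6
assert isclose(roberts_sim(X, Y), (1.5 + 2.0 + 0.0) / 6)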
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rogers_tanimoto. + +Rogers & Tanimoto similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['RogersTanimoto'] + + +class RogersTanimoto(_TokenDistance): + r"""Rogers & Tanimoto similarity. + + For two sets X and Y and a population N, the Rogers-Tanimoto similarity + :cite:`Rogers:1960` is + + .. math:: + + sim_{RogersTanimoto}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|} + {|X \setminus Y| + |Y \setminus X| + |N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{RogersTanimoto} = + \frac{a+d}{b+c+n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize RogersTanimoto instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(RogersTanimoto, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Rogers & Tanimoto similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Rogers & Tanimoto similarity + + Examples + -------- + >>> cmp = RogersTanimoto() + >>> cmp.sim('cat', 'hat') + 0.9898477157360406 + >>> cmp.sim('Niall', 'Neil') + 0.9823008849557522 + >>> cmp.sim('aluminum', 'Catalan') + 0.9625 + >>> cmp.sim('ATCG', 'TAGC') + 0.9748110831234257 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return (a + d) / (b + c + n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rogot_goldberg.py b/abydos/distance/_rogot_goldberg.py new file mode 100644 index 000000000..e181b80de --- /dev/null +++ b/abydos/distance/_rogot_goldberg.py @@ -0,0 +1,164 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rogot_goldberg. + +Rogot & Goldberg similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['RogotGoldberg'] + + +class RogotGoldberg(_TokenDistance): + r"""Rogot & Goldberg similarity. + + For two sets X and Y and a population N, Rogot & Goldberg's + "second index adjusted agreement" :math:`A_2` :cite:`Rogot:1966` is + + .. math:: + + sim_{RogotGoldberg}(X, Y) = + \frac{1}{2}\Bigg( + \frac{2|X \cap Y|}{|X|+|Y|} + + \frac{2|(N \setminus X) \setminus Y|} + {|N \setminus X|+|N \setminus Y|} + \Bigg) + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{RogotGoldberg} = + \frac{1}{2}\Bigg( + \frac{2a}{2a+b+c} + + \frac{2d}{2d+b+c} + \Bigg) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize RogotGoldberg instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(RogotGoldberg, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Rogot & Goldberg similarity of two strings. 
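# Standalone sketch of the 2x2 confusion-table quantities (a, b, c, d with
# a+b+c+d = n) that RogersTanimoto and RogotGoldberg plug into the formulas
# shown above.  The population size is passed in explicitly here; the classes
# derive it from a configurable alphabet via _TokenDistance, so absolute
# values differ from the doctests.
from math import isclose

def confusion_counts(src_tokens, tar_tokens, population_size):
    a = len(src_tokens & tar_tokens)      # in both
    b = len(src_tokens - tar_tokens)      # src only
    c = len(tar_tokens - src_tokens)      # tar only
    d = population_size - a - b - c       # in neither
    return a, b, c, d

def rogers_tanimoto(a, b, c, d):
    n = a + b + c + d
    return (a + d) / (b + c + n)

def rogot_goldberg(a, b, c, d):
    # the class above additionally guards the degenerate a == 0 / d == 0 cases
    return a / (2 * a + b + c) + d / (2 * d + b + c)

a, b, c, d = confusion_counts({'a', 'b', 'c'}, {'a', 'd', 'e'}, 20)
assert (a, b, c, d) == (1, 2, 2, 15)
assert isclose(rogers_tanimoto(a, b, c, d), 16 / 24)
assert isclose(rogot_goldberg(a, b, c, d), 1 / 6 + 15 / 34)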
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Rogot & Goldberg similarity + + Examples + -------- + >>> cmp = RogotGoldberg() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6795702691656449 + >>> cmp.sim('aluminum', 'Catalan') + 0.5539941668876179 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + p1 = a / (2 * a + b + c) if a else 0 + p2 = d / (2 * d + b + c) if d else 0 + + return p1 + p2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rouge_l.py b/abydos/distance/_rouge_l.py new file mode 100644 index 000000000..12a2c287b --- /dev/null +++ b/abydos/distance/_rouge_l.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rouge_l. + +Rouge-L similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from . import LCSseq +from ._distance import _Distance + +__all__ = ['RougeL'] + + +class RougeL(_Distance): + r"""Rouge-L similarity. + + Rouge-L similarity :cite:`Lin:2004` + + .. versionadded:: 0.4.0 + """ + + _lcs = LCSseq() + + def __init__(self, **kwargs): + """Initialize RougeL instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(RougeL, self).__init__(**kwargs) + + def sim(self, src, tar, beta=8): + """Return the Rouge-L similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + beta : int or float + A weighting factor to prejudice similarity towards src + + Returns + ------- + float + Rouge-L similarity + + Examples + -------- + >>> cmp = RougeL() + >>> cmp.sim('cat', 'hat') + 0.6666666666666666 + >>> cmp.sim('Niall', 'Neil') + 0.6018518518518519 + >>> cmp.sim('aluminum', 'Catalan') + 0.3757225433526012 + >>> cmp.sim('ATCG', 'TAGC') + 0.5 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + lcs_len = len(self._lcs.lcsseq(src, tar)) + r_lcs = lcs_len / len(src) + p_lcs = lcs_len / len(tar) + beta_sq = beta * beta + + if r_lcs and p_lcs: + return (1 + beta_sq) * r_lcs * p_lcs / (r_lcs + beta_sq * p_lcs) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rouge_s.py b/abydos/distance/_rouge_s.py new file mode 100644 index 000000000..92cdaded1 --- /dev/null +++ b/abydos/distance/_rouge_s.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rouge_s. + +Rouge-S similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance +from ..tokenizer import QSkipgrams +from ..util._ncr import _ncr + +__all__ = ['RougeS'] + + +class RougeS(_Distance): + r"""Rouge-S similarity. + + Rouge-S similarity :cite:`Lin:2004`, operating on character-level skipgrams + + .. versionadded:: 0.4.0 + """ + + def __init__(self, qval=2, **kwargs): + """Initialize RougeS instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(RougeS, self).__init__(**kwargs) + self._qval = qval + self._tokenizer = QSkipgrams(qval=qval, start_stop='') + + def sim(self, src, tar, beta=8): + """Return the Rouge-S similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + beta : int or float + A weighting factor to prejudice similarity towards src + + Returns + ------- + float + Rouge-S similarity + + Examples + -------- + >>> cmp = RougeS() + >>> cmp.sim('cat', 'hat') + 0.3333333333333333 + >>> cmp.sim('Niall', 'Neil') + 0.30185758513931893 + >>> cmp.sim('aluminum', 'Catalan') + 0.10755653612796467 + >>> cmp.sim('ATCG', 'TAGC') + 0.6666666666666666 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + qsg_src = self._tokenizer.tokenize(src).get_counter() + qsg_tar = self._tokenizer.tokenize(tar).get_counter() + intersection = sum((qsg_src & qsg_tar).values()) + + if intersection: + r_skip = intersection / _ncr(len(src), self._qval) + p_skip = intersection / _ncr(len(tar), self._qval) + else: + return 0.0 + + beta_sq = beta * beta + + return (1 + beta_sq) * r_skip * p_skip / (r_skip + beta_sq * p_skip) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rouge_su.py b/abydos/distance/_rouge_su.py new file mode 100644 index 000000000..7f0961625 --- /dev/null +++ b/abydos/distance/_rouge_su.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
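# Standalone sketch of the Rouge-L computation above: LCS-based recall and
# precision combined into a beta-weighted F-measure (beta defaults to 8).
# A small LCS-length DP stands in for the package's LCSseq helper.
from math import isclose

def lcs_len(src, tar):
    prev = [0] * (len(tar) + 1)
    for s_ch in src:
        cur = [0]
        for j, t_ch in enumerate(tar, start=1):
            cur.append(prev[j - 1] + 1 if s_ch == t_ch
                       else max(prev[j], cur[j - 1]))
        prev = cur
    return prev[-1]

def rouge_l(src, tar, beta=8):
    lcs = lcs_len(src, tar)
    r_lcs, p_lcs = lcs / len(src), lcs / len(tar)
    if not (r_lcs and p_lcs):
        return 0.0
    beta_sq = beta * beta
    return (1 + beta_sq) * r_lcs * p_lcs / (r_lcs + beta_sq * p_lcs)

# 'cat' vs. 'hat': the LCS is 'at', so recall == precision == 2/3 and the
# F-measure collapses to 2/3, matching the doctest above.
assert lcs_len('cat', 'hat') == 2
assert isclose(rouge_l('cat', 'hat'), 2 / 3)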
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rouge_su. + +Rouge-SU similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from . import RougeS + +__all__ = ['RougeSU'] + + +class RougeSU(RougeS): + r"""Rouge-SU similarity. + + Rouge-SU similarity :cite:`Lin:2004`, operating on character-level + skipgrams + + .. versionadded:: 0.4.0 + """ + + def __init__(self, qval=2, **kwargs): + """Initialize RougeSU instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(RougeSU, self).__init__(qval=qval, **kwargs) + + def sim(self, src, tar, beta=8): + """Return the Rouge-SU similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + beta : int or float + A weighting factor to prejudice similarity towards src + + Returns + ------- + float + Rouge-SU similarity + + Examples + -------- + >>> cmp = RougeSU() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.4020618556701031 + >>> cmp.sim('aluminum', 'Catalan') + 0.1672384219554031 + >>> cmp.sim('ATCG', 'TAGC') + 0.8 + + + .. versionadded:: 0.4.0 + + """ + return super(RougeSU, self).sim( + '$' * (self._qval - 1) + src, '$' * (self._qval - 1) + tar, beta + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_rouge_w.py b/abydos/distance/_rouge_w.py new file mode 100644 index 000000000..e67c0a45e --- /dev/null +++ b/abydos/distance/_rouge_w.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._rouge_w. + +Rouge-W similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import int as np_int +from numpy import zeros as np_zeros + +from ._distance import _Distance + +__all__ = ['RougeW'] + + +class RougeW(_Distance): + r"""Rouge-W similarity. + + Rouge-W similarity :cite:`Lin:2004` + + .. versionadded:: 0.4.0 + """ + + def __init__(self, f_func=None, f_inv=None, **kwargs): + """Initialize RougeW instance. 
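# Standalone sketch of the Rouge-S computation above: character-level
# skip-bigrams counted in a Counter, normalized by C(len, 2), and combined
# into the same beta-weighted F-measure.  Rouge-SU differs only in prefixing
# both strings with a marker character so that unigrams count as well.
# math.comb stands in for the package's _ncr helper.
from collections import Counter
from itertools import combinations
from math import comb, isclose

def skip_bigrams(text):
    return Counter(a + b for a, b in combinations(text, 2))

def rouge_s(src, tar, beta=8):
    inter = sum((skip_bigrams(src) & skip_bigrams(tar)).values())
    if not inter:
        return 0.0
    r_skip = inter / comb(len(src), 2)
    p_skip = inter / comb(len(tar), 2)
    beta_sq = beta * beta
    return (1 + beta_sq) * r_skip * p_skip / (r_skip + beta_sq * p_skip)

def rouge_su(src, tar, beta=8):
    return rouge_s('$' + src, '$' + tar, beta)

# 'cat'/'hat' share only the skip-bigram 'at', so r == p == 1/3 and the
# F-measure is 1/3; with the '$' prefix they also share '$a' and '$t',
# giving 0.5 -- both matching the doctests above.
assert isclose(rouge_s('cat', 'hat'), 1 / 3)
assert isclose(rouge_su('cat', 'hat'), 0.5)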
+ + Parameters + ---------- + f_func : function + A weighting function based on the value supplied to this function, + such that f(x+y) > f(x) + f(y) + f_inv : function + The close form inverse of f_func + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(RougeW, self).__init__(**kwargs) + self._f_func = f_func + self._f_inv = f_inv + + if self._f_func is None: + self._f_func = RougeW._square # noqa: SF01 + if self._f_inv is None: + self._f_inv = RougeW._sqrt # noqa: SF01 + + @staticmethod + def _square(n): + return n * n + + @staticmethod + def _sqrt(n): + return n ** 0.5 + + def wlcs(self, src, tar): + """Return the Rouge-W weighted longest common sub-sequence length. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int (may return a float if cost has float values) + The Levenshtein distance between src & tar + + Examples + -------- + >>> cmp = RougeW() + >>> cmp.wlcs('cat', 'hat') + 4 + >>> cmp.wlcs('Niall', 'Neil') + 3 + >>> cmp.wlcs('aluminum', 'Catalan') + 5 + >>> cmp.wlcs('ATCG', 'TAGC') + 3 + + .. versionadded:: 0.4.0 + + """ + src_len = len(src) + tar_len = len(tar) + + if src == tar: + return self._f_func(len(src)) + if not src: + return 0 + if not tar: + return 0 + + c_mat = np_zeros((src_len, tar_len), dtype=np_int) + w_mat = np_zeros((src_len, tar_len), dtype=np_int) + + for i in range(src_len): + for j in range(tar_len): + if src[i] == tar[j]: + k = w_mat[i - 1, j - 1] + c_mat[i, j] = ( + c_mat[i - 1, j - 1] + + self._f_func(k + 1) + - self._f_func(k) + ) + w_mat[i, j] = k + 1 + else: + if c_mat[i - 1, j] > c_mat[i, j - 1]: + c_mat[i, j] = c_mat[i - 1, j] + w_mat[i, j] = 0 + else: + c_mat[i, j] = c_mat[i, j - 1] + w_mat[i, j] = 0 + + return c_mat[src_len - 1, tar_len - 1] + + def sim(self, src, tar, beta=8): + """Return the Rouge-W similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + beta : int or float + A weighting factor to prejudice similarity towards src + + Returns + ------- + float + Rouge-W similarity + + Examples + -------- + >>> cmp = RougeW() + >>> cmp.sim('cat', 'hat') + 0.6666666666666666 + >>> cmp.sim('Niall', 'Neil') + 0.34747932867894143 + >>> cmp.sim('aluminum', 'Catalan') + 0.280047049205176 + >>> cmp.sim('ATCG', 'TAGC') + 0.43301270189221935 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + wlcs = self.wlcs(src, tar) + r_wlcs = self._f_inv(wlcs / self._f_func(len(src))) + p_wlcs = self._f_inv(wlcs / self._f_func(len(tar))) + beta_sq = beta * beta + + if r_wlcs and p_wlcs: + return ( + (1 + beta_sq) * r_wlcs * p_wlcs / (r_wlcs + beta_sq * p_wlcs) + ) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_russell_rao.py b/abydos/distance/_russell_rao.py new file mode 100644 index 000000000..46afd654a --- /dev/null +++ b/abydos/distance/_russell_rao.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
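# Worked check of the normalization in RougeW.sim above, re-deriving one
# doctest value rather than re-implementing the DP.  With the default
# f(x) = x * x and f_inv = sqrt, the doctest value wlcs('cat', 'hat') == 4
# gives recall = precision = sqrt(4 / 3**2) = 2/3, and the beta-weighted
# F-measure of two equal values is that value again -- the 0.6666... above.
from math import isclose, sqrt

wlcs, beta = 4, 8
r_wlcs = sqrt(wlcs / 3 ** 2)   # f_inv(wlcs / f(len('cat')))
p_wlcs = sqrt(wlcs / 3 ** 2)   # f_inv(wlcs / f(len('hat')))
sim = (1 + beta * beta) * r_wlcs * p_wlcs / (r_wlcs + beta * beta * p_wlcs)
assert isclose(sim, 2 / 3)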
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._russell_rao. + +Russell & Rao similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['RussellRao'] + + +class RussellRao(_TokenDistance): + r"""Russell & Rao similarity. + + For two sets X and Y and a population N, the Russell & Rao similarity + :cite:`Russell:1940` is + + .. math:: + + sim_{RussellRao}(X, Y) = \frac{|X \cap Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{RussellRao} = + \frac{a}{n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize RussellRao instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(RussellRao, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Russell & Rao similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Russell & Rao similarity + + Examples + -------- + >>> cmp = RussellRao() + >>> cmp.sim('cat', 'hat') + 0.002551020408163265 + >>> cmp.sim('Niall', 'Neil') + 0.002551020408163265 + >>> cmp.sim('aluminum', 'Catalan') + 0.0012738853503184713 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + n = self._population_unique_card() + + if a: + return a / n + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_saps.py b/abydos/distance/_saps.py new file mode 100644 index 000000000..270e7813f --- /dev/null +++ b/abydos/distance/_saps.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._saps_alignment. + +Syllable Alignment Pattern Searching tokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import int as np_int +from numpy import zeros as np_zeros + +from six.moves import range + +from ._distance import _Distance +from ..tokenizer import SAPSTokenizer + +__all__ = ['SAPS'] + + +class SAPS(_Distance): + """Syllable Alignment Pattern Searching tokenizer. + + This is the alignment and similarity calculation described on p. 917-918 of + :cite:`Ruibin:2005`. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + cost=(1, -1, -4, 6, -2, -1, -3), + normalizer=max, + tokenizer=None, + **kwargs + ): + """Initialize SAPS instance. + + Parameters + ---------- + cost : tuple + A 7-tuple representing the cost of the four possible matches: + + - syllable-internal match + - syllable-internal mis-match + - syllable-initial match or mismatch with syllable-internal + - syllable-initial match + - syllable-initial mis-match + - syllable-internal gap + - syllable-initial gap + + (by default: (1, -1, -4, 6, -2, -1, -3)) + normalizer : function + A function that takes an list and computes a normalization term + by which the edit distance is divided (max by default). Another + good option is the sum function. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(SAPS, self).__init__(**kwargs) + self._s1, self._s2, self._s3, self._s4, self._s5 = cost[:5] + self._g1, self._g2 = cost[5:] + + self._normalizer = normalizer + if tokenizer is None: + self._tokenizer = SAPSTokenizer() + else: + self._tokenizer = tokenizer + + def _s(self, src, tar): + if src.isupper(): + if tar.isupper(): + return self._s4 if src == tar else self._s5 + else: + return self._s3 + else: + if tar.islower(): + return self._s1 if src == tar else self._s2 + else: + return self._s3 + + def _g(self, ch): + if ch.isupper(): + return self._g2 + else: + return self._g1 + + def sim_score(self, src, tar): + """Return the SAPS similarity between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The SAPS similarity between src & tar + + Examples + -------- + >>> cmp = SAPS() + >>> cmp.sim_score('cat', 'hat') + 0 + >>> cmp.sim_score('Niall', 'Neil') + 3 + >>> cmp.sim_score('aluminum', 'Catalan') + -11 + >>> cmp.sim_score('ATCG', 'TAGC') + -1 + >>> cmp.sim_score('Stevenson', 'Stinson') + 16 + + + .. 
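# Standalone sketch of the SAPS scoring functions _s and _g above, using the
# default cost tuple from __init__.  The tokenization step in sim_score
# upper-cases syllable-initial characters, so a pair's score depends only on
# case and equality, and gaps cost more at a syllable boundary.
S1, S2, S3, S4, S5, G1, G2 = 1, -1, -4, 6, -2, -1, -3

def match_score(src_ch, tar_ch):
    if src_ch.isupper():
        if tar_ch.isupper():                      # both syllable-initial
            return S4 if src_ch == tar_ch else S5
        return S3                                 # initial vs. internal
    if tar_ch.islower():                          # both syllable-internal
        return S1 if src_ch == tar_ch else S2
    return S3

def gap_cost(ch):
    return G2 if ch.isupper() else G1

assert match_score('S', 'S') == 6    # matching syllable onsets
assert match_score('t', 'n') == -1   # mismatched syllable-internal characters
assert gap_cost('S') == -3           # dropping a syllable onset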
versionadded:: 0.4.0 + + """ + src = self._tokenizer.tokenize(src).get_list() + tar = self._tokenizer.tokenize(tar).get_list() + + src = ''.join([_[0].upper() + _[1:].lower() for _ in src]) + tar = ''.join([_[0].upper() + _[1:].lower() for _ in tar]) + + d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) + for i in range(len(src)): + d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src[i]) + for j in range(len(tar)): + d_mat[0, j + 1] = d_mat[0, j] + self._g(tar[j]) + + for i in range(len(src)): + for j in range(len(tar)): + d_mat[i + 1, j + 1] = max( + d_mat[i, j + 1] + self._g(src[i]), # ins + d_mat[i + 1, j] + self._g(tar[j]), # del + d_mat[i, j] + self._s(src[i], tar[j]), # sub/== + ) + + return d_mat[len(src), len(tar)] + + def sim(self, src, tar): + """Return the normalized SAPS similarity between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The normalized SAPS similarity between src & tar + + Examples + -------- + >>> cmp = SAPS() + >>> round(cmp.sim('cat', 'hat'), 12) + 0.0 + >>> round(cmp.sim('Niall', 'Neil'), 12) + 0.2 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score <= 0: + return 0.0 + + src = self._tokenizer.tokenize(src).get_list() + src_max = sum(5 + len(_) for _ in src) + tar = self._tokenizer.tokenize(tar).get_list() + tar_max = sum(5 + len(_) for _ in tar) + + return score / max(src_max, tar_max) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_scott_pi.py b/abydos/distance/_scott_pi.py new file mode 100644 index 000000000..12b834a54 --- /dev/null +++ b/abydos/distance/_scott_pi.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._scott_pi. + +Scott's Pi correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['ScottPi'] + + +class ScottPi(_TokenDistance): + r"""Scott's Pi correlation. + + For two sets X and Y and a population N, Scott's :math:`\pi` correlation + :cite:`Scott:1955` is + + .. math:: + + corr_{Scott_\pi}(X, Y) = \pi = + \frac{p_o - p_e^\pi}{1 - p_e^\pi} + + where + + .. math:: + + \begin{array}{ll} + p_o &= \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} + + p_e^\pi &= \Big(\frac{|X| + |Y|}{2 \cdot |N|}\Big)^2 + + \Big(\frac{|N \setminus X| + |N \setminus Y|}{2 \cdot |N|}\Big)^2 + \end{array} + + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + \begin{array}{ll} + p_o &= \frac{a+d}{n} + + p_e^\pi &= \Big(\frac{2a+b+c}{2n}\Big)^2 + + \Big(\frac{2d+b+c}{2n}\Big)^2 + \end{array} + + + .. 
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize ScottPi instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(ScottPi, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Scott's Pi correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Scott's Pi correlation + + Examples + -------- + >>> cmp = ScottPi() + >>> cmp.corr('cat', 'hat') + 0.49743589743589733 + >>> cmp.corr('Niall', 'Neil') + 0.35914053833129245 + >>> cmp.corr('aluminum', 'Catalan') + 0.10798833377524023 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237489689 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + po = (a + d) / n + pe = ((2 * a + b + c) / (2 * n)) ** 2 + ( + (2 * d + b + c) / (2 * n) + ) ** 2 + + if po != pe: + return (po - pe) / (1 - pe) + return 0.0 + + def sim(self, src, tar): + """Return the Scott's Pi similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Scott's Pi similarity + + Examples + -------- + >>> cmp = ScottPi() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6795702691656462 + >>> cmp.sim('aluminum', 'Catalan') + 0.5539941668876202 + >>> cmp.sim('ATCG', 'TAGC') + 0.49679075738125517 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_shape.py b/abydos/distance/_shape.py new file mode 100644 index 000000000..2c9838735 --- /dev/null +++ b/abydos/distance/_shape.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
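# Standalone sketch of the Scott's pi computation above, from explicit 2x2
# confusion-table counts (a, b, c, d here are small hypothetical values, not
# the counts the package's default alphabet would produce).
from math import isclose

a, b, c, d = 2, 1, 1, 6
n = a + b + c + d

p_o = (a + d) / n
p_e = ((2 * a + b + c) / (2 * n)) ** 2 + ((2 * d + b + c) / (2 * n)) ** 2
pi = (p_o - p_e) / (1 - p_e)
sim = (1.0 + pi) / 2.0    # how ScottPi.sim rescales corr onto [0, 1]

assert isclose(p_o, 0.8)
assert isclose(p_e, 0.3 ** 2 + 0.7 ** 2)
assert isclose(pi, (0.8 - 0.58) / 0.42)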
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._shape_difference. + +Penrose's shape difference +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Shape'] + + +class Shape(_TokenDistance): + r"""Penrose's shape difference. + + For two sets X and Y and a population N, the Penrose's shape difference + :cite:`Penrose:1952` is + + .. math:: + + dist_{Shape}(X, Y) = + \frac{1}{|N|}\cdot\Big(\sum_{x \in (X \triangle Y)} x^2\Big) - + \Big(\frac{|X \triangle Y|}{|N|}\Big)^2 + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Shape} = + \frac{1}{n}\Big(\sum_{x \in b} x^2 + \sum_{x \in c} x^2\Big) - + \Big(\frac{b+c}{n}\Big)^2 + + In :cite:`IBM:2017`, the formula is instead + :math:`\frac{n(b+c)-(b-c)^2}{n^2}`, but it is clear from + :cite:`Penrose:1952` that this should not be an assymmetric value with + respect to the ordering of the two sets, among other errors in this + formula. Meanwhile, :cite:`Deza:2016` gives the formula + :math:`\sqrt{\sum((x_i-\bar{x})-(y_i-\bar{y}))^2}`. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Shape instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Shape, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Penrose's shape difference of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Shape ifference + + Examples + -------- + >>> cmp = Shape() + >>> cmp.sim('cat', 'hat') + 0.994923990004165 + >>> cmp.sim('Niall', 'Neil') + 0.9911511479591837 + >>> cmp.sim('aluminum', 'Catalan') + 0.9787090754188811 + >>> cmp.sim('ATCG', 'TAGC') + 0.9874075905872554 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + symdiff = self._symmetric_difference().values() + + dist = sum(symdiff) + dist_sq = sum(_ ** 2 for _ in symdiff) + n = self._population_unique_card() + + return dist_sq / n - (dist / n) ** 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_shapira_storer_i.py b/abydos/distance/_shapira_storer_i.py new file mode 100644 index 000000000..150c6b2d8 --- /dev/null +++ b/abydos/distance/_shapira_storer_i.py @@ -0,0 +1,261 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._shapira_storer_i. + +Shapira & Storer I edit distance with block moves, greedy algorithm +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Counter + +from numpy import int as np_int +from numpy import zeros as np_zeros + +from six.moves import range + +from ._lcsstr import LCSstr +from ._levenshtein import Levenshtein + +__all__ = ['ShapiraStorerI'] + + +class ShapiraStorerI(Levenshtein): + """Shapira & Storer I edit distance with block moves, greedy algorithm. + + Shapira & Storer's greedy edit distance :cite:`Shapira:2007` is similar to + Levenshtein edit distance, but with two important distinctions: + + - It considers blocks of characters, if they occur in both the source + and target strings, so the edit distance between 'abcab' and 'abc' + is only 1, since the substring 'ab' occurs in both and can be + inserted as a block into 'abc'. + - It allows three edit operations: insert, delete, and move (but not + substitute). Thus the distance between 'abcde' and 'deabc' is only 1 + because the block 'abc' can be moved in 1 move operation, rather than + being deleted and inserted in 2 separate operations. + + If prime is set to True at initialization, this employs the greedy' + algorithm, which limits replacements of blocks in the two strings to + matching occurrences of the LCS. + + .. versionadded:: 0.4.0 + """ + + _lcs = LCSstr() + + def __init__(self, prime=False, **kwargs): + """Initialize ShapiraStorerI instance. + + Parameters + ---------- + prime : bool + If True, employs the greedy' algorithm rather than greedy + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + self._prime = prime + super(ShapiraStorerI, self).__init__(**kwargs) + + def dist_abs(self, src, tar): + """Return the Shapira & Storer I edit distance between two strings. 
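# Standalone sketch of the shape-difference computation in Shape.dist above.
# It assumes the symmetric-difference counts are the per-token absolute
# differences |X_i - Y_i|; the class derives them from its
# _symmetric_difference() helper and a configured population size instead.
from collections import Counter
from math import isclose

def shape_difference(x, y, n):
    symdiff = [abs(x[t] - y[t]) for t in set(x) | set(y)]
    total = sum(symdiff)
    total_sq = sum(v * v for v in symdiff)
    return total_sq / n - (total / n) ** 2

X, Y = Counter({'a': 2, 'b': 1}), Counter({'a': 1, 'c': 1})
# differences: 'a' -> 1, 'b' -> 1, 'c' -> 1; population n = 10
assert isclose(shape_difference(X, Y, 10), 3 / 10 - (3 / 10) ** 2)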
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The Shapira & Storer I edit distance between src & tar + + Examples + -------- + >>> cmp = ShapiraStorerI() + >>> cmp.dist_abs('cat', 'hat') + 2 + >>> cmp.dist_abs('Niall', 'Neil') + 3 + >>> cmp.dist_abs('aluminum', 'Catalan') + 9 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2 + + + .. versionadded:: 0.4.0 + + """ + alphabet = set(src) | set(tar) + next_char = 'A' + lcs = self._lcs.lcsstr(src, tar) + while len(lcs) > 1: + while next_char in alphabet: + next_char = chr(ord(next_char) + 1) + if self._prime: + count = min(src.count(lcs), tar.count(lcs)) + src = src.replace(lcs, next_char, count) + tar = tar.replace(lcs, next_char, count) + else: + src = src.replace(lcs, next_char) + tar = tar.replace(lcs, next_char) + alphabet |= {next_char} + lcs = self._lcs.lcsstr(src, tar) + + d = self._edit_with_moves(src, tar) + return d + + def _edit_with_moves(self, src, tar): + """Return the edit distance between two strings using ins, del, & move. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The Levenshtein distance between src & tar + + + .. versionadded:: 0.4.0 + + """ + ins_cost, del_cost = self._cost[:2] + + if src == tar: + return 0 + if not src: + return len(tar) * ins_cost + if not tar: + return len(src) * del_cost + + d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) + for i in range(len(src) + 1): + d_mat[i, 0] = i * del_cost + for j in range(len(tar) + 1): + d_mat[0, j] = j * ins_cost + + for i in range(len(src)): + for j in range(len(tar)): + d_mat[i + 1, j + 1] = min( + d_mat[i + 1, j] + ins_cost, # ins + d_mat[i, j + 1] + del_cost, # del + d_mat[i, j] + + (float('inf') if src[i] != tar[j] else 0), # sub/== + ) + + distance = d_mat[len(src), len(tar)] + + # Do a backtrace on d_mat to discover an optimal path & count + # inserted & deleted characters + i = len(src) + j = len(tar) + inserts = Counter() + deletes = Counter() + while i > 0 and j > 0: + ante = [d_mat[i - 1, j - 1], d_mat[i - 1, j], d_mat[i, j - 1]] + least = ante.index(min(ante)) + old_dist = d_mat[i, j] + if least == 0: + i -= 1 + j -= 1 + if d_mat[i, j] < old_dist: + deletes[src[i]] += 1 + inserts[tar[j]] += 1 + elif least == 1: + i -= 1 + if d_mat[i, j] < old_dist: + deletes[src[i]] += 1 + else: + j -= 1 + if d_mat[i, j] < old_dist: + inserts[tar[j]] += 1 + while i > 0: + i -= 1 + if d_mat[i, j] < old_dist: + deletes[src[i]] += 1 + while j > 0: + j -= 1 + if d_mat[i, j] < old_dist: + inserts[tar[j]] += 1 + + moves = sum((inserts & deletes).values()) + + return distance - moves + + def dist(self, src, tar): + """Return the normalized Shapira & Storer I distance. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The normalized Shapira & Storer I distance between src & tar + + Examples + -------- + >>> cmp = ShapiraStorerI() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.333333333333 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 0.333333333333 + >>> cmp.dist('aluminum', 'Catalan') + 0.6 + >>> cmp.dist('ATCG', 'TAGC') + 0.25 + + + .. 
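# Sketch of the greedy block-replacement loop in dist_abs above, with the
# final move-aware insert/delete pass omitted.  difflib stands in for the
# package's LCSstr helper when locating the longest common substring.
from difflib import SequenceMatcher

def factor_common_blocks(src, tar):
    next_char = 'A'
    alphabet = set(src) | set(tar)
    while True:
        match = SequenceMatcher(None, src, tar).find_longest_match(
            0, len(src), 0, len(tar)
        )
        if match.size <= 1:
            return src, tar
        lcs = src[match.a:match.a + match.size]
        while next_char in alphabet:          # pick an unused placeholder
            next_char = chr(ord(next_char) + 1)
        src = src.replace(lcs, next_char)
        tar = tar.replace(lcs, next_char)
        alphabet.add(next_char)

# Both shared blocks of the class docstring's example collapse to single
# placeholder symbols, leaving one swapped pair for the move-aware pass --
# which is how 'abcde' vs. 'deabc' comes out to an edit distance of 1.
assert factor_common_blocks('abcde', 'deabc') == ('AB', 'BA')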
versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + ins_cost, del_cost = self._cost[:2] + src_len = len(src) + tar_len = len(tar) + return self.dist_abs(src, tar) / ( + sum([src_len * del_cost, tar_len * ins_cost]) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sift4.py b/abydos/distance/_sift4.py index 48bba1168..530123202 100644 --- a/abydos/distance/_sift4.py +++ b/abydos/distance/_sift4.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Sift4', 'dist_sift4', 'sift4_common', 'sim_sift4'] @@ -40,9 +43,31 @@ class Sift4(_Distance): This is an approximation of edit distance, described in :cite:`Zackwehdex:2014`. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, max_offset=5, max_distance=0): + def __init__(self, max_offset=5, max_distance=0, **kwargs): + """Initialize Sift4 instance. + + Parameters + ---------- + max_offset : int + The number of characters to search for matching letters + max_distance : int + The distance at which to stop and exit + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Sift4, self).__init__(**kwargs) + self._max_offset = max_offset + self._max_distance = max_distance + + def dist_abs(self, src, tar): """Return the "common" Sift4 distance between two terms. Parameters @@ -51,10 +76,6 @@ def dist_abs(self, src, tar, max_offset=5, max_distance=0): Source string for comparison tar : str Target string for comparison - max_offset : int - The number of characters to search for matching letters - max_distance : int - The distance at which to stop and exit Returns ------- @@ -73,6 +94,11 @@ def dist_abs(self, src, tar, max_offset=5, max_distance=0): >>> cmp.dist_abs('ATCG', 'TAGC') 2 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if not src: return len(tar) @@ -120,7 +146,7 @@ def dist_abs(self, src, tar, max_offset=5, max_distance=0): local_cs = 0 if src_cur != tar_cur: src_cur = tar_cur = min(src_cur, tar_cur) - for i in range(max_offset): + for i in range(self._max_offset): if not ( (src_cur + i < src_len) or (tar_cur + i < tar_len) ): @@ -141,9 +167,9 @@ def dist_abs(self, src, tar, max_offset=5, max_distance=0): src_cur += 1 tar_cur += 1 - if max_distance: + if self._max_distance: temporary_distance = max(src_cur, tar_cur) - lcss + trans - if temporary_distance >= max_distance: + if temporary_distance >= self._max_distance: return round(temporary_distance) if (src_cur >= src_len) or (tar_cur >= tar_len): @@ -154,7 +180,7 @@ def dist_abs(self, src, tar, max_offset=5, max_distance=0): lcss += local_cs return round(max(src_len, tar_len) - lcss + trans) - def dist(self, src, tar, max_offset=5, max_distance=0): + def dist(self, src, tar): """Return the normalized "common" Sift4 distance between two terms. This is Sift4 distance, normalized to [0, 1]. 
@@ -165,10 +191,6 @@ def dist(self, src, tar, max_offset=5, max_distance=0): Source string for comparison tar : str Target string for comparison - max_offset : int - The number of characters to search for matching letters - max_distance : int - The distance at which to stop and exit Returns ------- @@ -187,12 +209,21 @@ def dist(self, src, tar, max_offset=5, max_distance=0): >>> cmp.dist('ATCG', 'TAGC') 0.5 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return self.dist_abs(src, tar, max_offset, max_distance) / ( - max(len(src), len(tar), 1) - ) + return self.dist_abs(src, tar) / (max(len(src), len(tar), 1)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Sift4.dist_abs method instead.', +) def sift4_common(src, tar, max_offset=5, max_distance=0): """Return the "common" Sift4 distance between two terms. @@ -225,10 +256,18 @@ def sift4_common(src, tar, max_offset=5, max_distance=0): >>> sift4_common('ATCG', 'TAGC') 2 + .. versionadded:: 0.3.0 + """ - return Sift4().dist_abs(src, tar, max_offset, max_distance) + return Sift4(max_offset, max_distance).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Sift4.dist method instead.', +) def dist_sift4(src, tar, max_offset=5, max_distance=0): """Return the normalized "common" Sift4 distance between two terms. @@ -261,10 +300,18 @@ def dist_sift4(src, tar, max_offset=5, max_distance=0): >>> dist_sift4('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.3.0 + """ - return Sift4().dist(src, tar, max_offset, max_distance) + return Sift4(max_offset, max_distance).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Sift4.sim method instead.', +) def sim_sift4(src, tar, max_offset=5, max_distance=0): """Return the normalized "common" Sift4 similarity of two terms. @@ -297,8 +344,10 @@ def sim_sift4(src, tar, max_offset=5, max_distance=0): >>> sim_sift4('ATCG', 'TAGC') 0.5 + .. versionadded:: 0.3.0 + """ - return Sift4().sim(src, tar, max_offset, max_distance) + return Sift4(max_offset, max_distance).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_sift4_extended.py b/abydos/distance/_sift4_extended.py new file mode 100644 index 000000000..20b402dcf --- /dev/null +++ b/abydos/distance/_sift4_extended.py @@ -0,0 +1,354 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sift4_extended. 
+ +Sift4 Extended approximate string distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from six.moves import range + +from ._distance import _Distance +from ._sift4 import Sift4 +from ..tokenizer import CharacterTokenizer + +__all__ = ['Sift4Extended'] + + +class Sift4Extended(_Distance): + r"""Sift4 Extended version. + + This is an approximation of edit distance, described in + :cite:`Zackwehdex:2014`. + + .. versionadded:: 0.4.0 + """ + + _sift4 = Sift4() + + def __init__( + self, + max_offset=5, + max_distance=0, + tokenizer=None, + token_matcher=None, + matching_evaluator=None, + local_length_evaluator=None, + transposition_cost_evaluator=None, + transpositions_evaluator=None, + **kwargs + ): + """Initialize Sift4Extended instance. + + Parameters + ---------- + max_offset : int + The number of characters to search for matching letters + max_distance : int + The distance at which to stop and exit + tokenizer : _Tokenizer + A tokenizer instance (character tokenization by default) + token_matcher : function + A token matcher function of two parameters (equality by default). + :math:`Sift4Extended.sift4_token_matcher` is also supplied. + matching_evaluator : function + A token match quality function of two parameters (1 by default). + :math:`Sift4Extended.sift4_matching_evaluator` is also supplied. + local_length_evaluator : function + A local length evaluator function (its single parameter by + default). :math:`Sift4Extended.reward_length_evaluator` and + :math:`Sift4Extended.reward_length_evaluator_exp` are also + supplied. + transposition_cost_evaluator : function + A transposition cost evaluator function of two parameters (1 by + default). + :math:`Sift4Extended.longer_transpositions_are_more_costly` is also + supplied. + transpositions_evaluator : function + A transpositions evaluator function of two parameters (the second + parameter subtracted from the first, by default). + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(_Distance, self).__init__(**kwargs) + self._max_offset = max_offset + self._max_distance = max_distance + self._tokenizer = tokenizer + self._token_matcher = token_matcher + self._matching_evaluator = matching_evaluator + self._local_length_evaluator = local_length_evaluator + self._transposition_cost_evaluator = transposition_cost_evaluator + self._transpositions_evaluator = transpositions_evaluator + + if self._tokenizer is None: + self._tokenizer = CharacterTokenizer() + if self._token_matcher is None: + self._token_matcher = lambda t1, t2: t1 == t2 + if self._matching_evaluator is None: + self._matching_evaluator = lambda t1, t2: 1 + if self._local_length_evaluator is None: + self._local_length_evaluator = lambda local_cs: local_cs + if self._transposition_cost_evaluator is None: + self._transposition_cost_evaluator = lambda c1, c2: 1 + if self._transpositions_evaluator is None: + self._transpositions_evaluator = lambda lcss, trans: lcss - trans + + def dist_abs(self, src, tar): + """Return the Sift4 Extended distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The Sift4 distance according to the extended formula + + Examples + -------- + >>> cmp = Sift4Extended() + >>> cmp.dist_abs('cat', 'hat') + 1 + >>> cmp.dist_abs('Niall', 'Neil') + 2 + >>> cmp.dist_abs('aluminum', 'Catalan') + 5 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2 + + + .. 
versionadded:: 0.4.0 + + """ + src = self._tokenizer.tokenize(src).get_list() + tar = self._tokenizer.tokenize(tar).get_list() + + if not src: + return len(tar) + + if not tar: + return len(src) + + src_len = len(src) + tar_len = len(tar) + + src_cur = 0 + tar_cur = 0 + lcss = 0 + local_cs = 0 + trans = 0 + offset_arr = [] + + while (src_cur < src_len) and (tar_cur < tar_len): + if self._token_matcher(src[src_cur], tar[tar_cur]): + local_cs += self._matching_evaluator( + src[src_cur], tar[tar_cur] + ) + is_trans = False + i = 0 + while i < len(offset_arr): + ofs = offset_arr[i] + if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']: + is_trans = abs(tar_cur - src_cur) >= abs( + ofs['tar_cur'] - ofs['src_cur'] + ) + if is_trans: + trans += self._transposition_cost_evaluator( + src_cur, tar_cur + ) + elif not ofs['trans']: + ofs['trans'] = True + trans += self._transposition_cost_evaluator( + ofs['tar_cur'], ofs['src_cur'] + ) + break + elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']: + del offset_arr[i] + else: + i += 1 + + offset_arr.append( + {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans} + ) + else: + lcss += self._local_length_evaluator(local_cs) + local_cs = 0 + if src_cur != tar_cur: + src_cur = tar_cur = min(src_cur, tar_cur) + for i in range(self._max_offset): + if not ( + (src_cur + i < src_len) or (tar_cur + i < tar_len) + ): + break + if (src_cur + i < src_len) and ( + self._token_matcher(src[src_cur + i], tar[tar_cur]) + ): + src_cur += i - 1 + tar_cur -= 1 + break + if (tar_cur + i < tar_len) and ( + self._token_matcher(src[src_cur], tar[tar_cur + i]) + ): + src_cur -= 1 + tar_cur += i - 1 + break + + src_cur += 1 + tar_cur += 1 + + if self._max_distance: + temporary_distance = self._local_length_evaluator( + max(src_cur, tar_cur) + ) - self._transpositions_evaluator(lcss, trans) + if temporary_distance >= self._max_distance: + return round(temporary_distance) + + if (src_cur >= src_len) or (tar_cur >= tar_len): + lcss += self._local_length_evaluator(local_cs) + local_cs = 0 + src_cur = tar_cur = min(src_cur, tar_cur) + + lcss += self._local_length_evaluator(local_cs) + return round( + self._local_length_evaluator(max(src_len, tar_len)) + - self._transpositions_evaluator(lcss, trans) + ) + + @staticmethod + def sift4_token_matcher(src, tar): + """Sift4 Token Matcher. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + bool + Whether the Sift4 similarity of the two tokens is over 0.7 + + .. versionadded:: 0.4.0 + + """ + return Sift4Extended.sift4_matching_evaluator(src, tar) > 0.7 + + @staticmethod + def sift4_matching_evaluator(src, tar): + """Sift4 Matching Evaluator. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The Sift4 similarity of the two tokens + + .. versionadded:: 0.4.0 + + """ + return Sift4Extended._sift4.sim(src, tar) + + @staticmethod + def reward_length_evaluator(length): + """Reward Length Evaluator. + + Parameters + ---------- + length : int + The length of a local match + + Returns + ------- + float + A reward value that grows sub-linearly + + .. versionadded:: 0.4.0 + + """ + if length < 1: + return 1 + return length - 1 / (length + 1) + + @staticmethod + def reward_length_evaluator_exp(length): + """Reward Length Evaluator. 
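Sift4Extended's behaviour is driven entirely by the callables passed to its constructor, and the static helpers defined here are ready-made choices. A short sketch (illustrative only, not part of the patch; assumes the usual top-level re-export) of plugging them in:

from abydos.distance import Sift4Extended

# Defaults: character tokens, exact token matching, unit costs.
cmp = Sift4Extended()
assert cmp.dist_abs('cat', 'hat') == 1      # doctest value above

# Swap in the supplied helpers: tokens match when their Sift4 similarity
# exceeds 0.7, long local matches are rewarded exponentially, and distant
# transpositions cost more.
fuzzy = Sift4Extended(
    token_matcher=Sift4Extended.sift4_token_matcher,
    matching_evaluator=Sift4Extended.sift4_matching_evaluator,
    local_length_evaluator=Sift4Extended.reward_length_evaluator_exp,
    transposition_cost_evaluator=(
        Sift4Extended.longer_transpositions_are_more_costly
    ),
)
score = fuzzy.dist_abs('Niall', 'Neil')     # value depends on the chosen evaluators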
+ + Parameters + ---------- + length : int + The length of a local match + + Returns + ------- + float + A reward value that grows exponentially + + .. versionadded:: 0.4.0 + + """ + return length ** 1.5 + + @staticmethod + def longer_transpositions_are_more_costly(pos1, pos2): + """Longer Transpositions Are More Costly. + + Parameters + ---------- + pos1 : int + The position of the first transposition + pos2 : int + The position of the second transposition + + Returns + ------- + float + A cost that grows as difference in the positions increases + + .. versionadded:: 0.4.0 + + """ + return abs(pos2 - pos1) / 9 + 1 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sift4_simplest.py b/abydos/distance/_sift4_simplest.py index 29720af53..abefd9551 100644 --- a/abydos/distance/_sift4_simplest.py +++ b/abydos/distance/_sift4_simplest.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._sift4 import Sift4 +from .. import __version__ __all__ = ['Sift4Simplest', 'sift4_simplest'] @@ -40,9 +43,28 @@ class Sift4Simplest(Sift4): This is an approximation of edit distance, described in :cite:`Zackwehdex:2014`. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, max_offset=5): + def __init__(self, max_offset=5, **kwargs): + """Initialize Sift4Simplest instance. + + Parameters + ---------- + max_offset : int + The number of characters to search for matching letters + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Sift4Simplest, self).__init__(**kwargs) + self._max_offset = max_offset + + def dist_abs(self, src, tar): """Return the "simplest" Sift4 distance between two terms. Parameters @@ -51,8 +73,6 @@ def dist_abs(self, src, tar, max_offset=5): Source string for comparison tar : str Target string for comparison - max_offset : int - The number of characters to search for matching letters Returns ------- @@ -71,6 +91,11 @@ def dist_abs(self, src, tar, max_offset=5): >>> cmp.dist_abs('ATCG', 'TAGC') 2 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if not src: return len(tar) @@ -94,19 +119,23 @@ def dist_abs(self, src, tar, max_offset=5): local_cs = 0 if src_cur != tar_cur: src_cur = tar_cur = max(src_cur, tar_cur) - for i in range(max_offset): + for i in range(self._max_offset): if not ( (src_cur + i < src_len) or (tar_cur + i < tar_len) ): break - if (src_cur + i < src_len) and ( - src[src_cur + i] == tar[tar_cur] + if ( + (src_cur + i < src_len) + and (tar_cur < tar_len) + and (src[src_cur + i] == tar[tar_cur]) ): src_cur += i local_cs += 1 break - if (tar_cur + i < tar_len) and ( - src[src_cur] == tar[tar_cur + i] + if ( + (tar_cur + i < tar_len) + and (src_cur < src_len) + and (src[src_cur] == tar[tar_cur + i]) ): tar_cur += i local_cs += 1 @@ -119,6 +148,12 @@ def dist_abs(self, src, tar, max_offset=5): return round(max(src_len, tar_len) - lcss) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Sift4Simplest.dist_abs method instead.', +) def sift4_simplest(src, tar, max_offset=5): """Return the "simplest" Sift4 distance between two terms. @@ -149,8 +184,10 @@ def sift4_simplest(src, tar, max_offset=5): >>> sift4_simplest('ATCG', 'TAGC') 2 + .. 
versionadded:: 0.3.0 + """ - return Sift4Simplest().dist_abs(src, tar, max_offset) + return Sift4Simplest(max_offset).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_single_linkage.py b/abydos/distance/_single_linkage.py new file mode 100644 index 000000000..e2c49c610 --- /dev/null +++ b/abydos/distance/_single_linkage.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._single_linkage. + +single linkage distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._levenshtein import Levenshtein +from ._token_distance import _TokenDistance + +__all__ = ['SingleLinkage'] + + +class SingleLinkage(_TokenDistance): + r"""Single linkage distance. + + For two multisets X and Y, single linkage distance + :cite:`Deza:2016` is + + .. math:: + + dist_{SingleLinkage}(X, Y) = + min_{i \in X, j \in Y} dist(X_i, Y_j) + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, metric=None, **kwargs): + """Initialize SingleLinkage instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. (Defaults to Levenshtein distance) + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(SingleLinkage, self).__init__(tokenizer=tokenizer, **kwargs) + if metric is None: + self._metric = Levenshtein() + else: + self._metric = metric + + def dist_abs(self, src, tar): + """Return the single linkage distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + single linkage distance + + Examples + -------- + >>> cmp = SingleLinkage() + >>> cmp.dist_abs('cat', 'hat') + 0.0 + >>> cmp.dist_abs('Niall', 'Neil') + 0.0 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.0 + >>> cmp.dist_abs('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src, tar = self._get_tokens() + + min_val = float('inf') + + for term_src in src.keys(): + for term_tar in tar.keys(): + min_val = min( + min_val, self._metric.dist_abs(term_src, term_tar) + ) + + return float(min_val) + + def dist(self, src, tar): + """Return the normalized single linkage distance of two strings. 
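SingleLinkage takes the minimum pairwise distance between the two strings' tokens, so any shared q-gram drives dist_abs to zero. A brief sketch (illustrative only, not part of the patch; assumes the default q-gram tokenization and top-level imports):

from abydos.distance import JaroWinkler, SingleLinkage

cmp = SingleLinkage()                        # Levenshtein over q-gram tokens by default
assert cmp.dist_abs('cat', 'hat') == 0.0     # the two strings share a q-gram
assert cmp.dist_abs('ATCG', 'TAGC') == 1.0   # nearest q-gram pair differs by one edit
assert cmp.dist('ATCG', 'TAGC') == 0.5       # same comparison over normalized distances

# Any other _Distance instance can be supplied as the token-level metric:
cmp_jw = SingleLinkage(metric=JaroWinkler())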
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + normalized single linkage distance + + Examples + -------- + >>> cmp = SingleLinkage() + >>> cmp.dist('cat', 'hat') + 0.0 + >>> cmp.dist('Niall', 'Neil') + 0.0 + >>> cmp.dist('aluminum', 'Catalan') + 0.0 + >>> cmp.dist('ATCG', 'TAGC') + 0.5 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src, tar = self._get_tokens() + + min_val = 1.0 + + for term_src in src.keys(): + for term_tar in tar.keys(): + min_val = min(min_val, self._metric.dist(term_src, term_tar)) + + return float(min_val) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_size.py b/abydos/distance/_size.py new file mode 100644 index 000000000..462319a57 --- /dev/null +++ b/abydos/distance/_size.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._size_difference. + +Penrose's size difference +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Size'] + + +class Size(_TokenDistance): + r"""Penrose's size difference. + + For two sets X and Y and a population N, the Penrose's size difference + :cite:`Penrose:1952` is + + .. math:: + + sim_{Size}(X, Y) = + \frac{(|X \triangle Y|)^2}{|N|^2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Size} = + \frac{(b+c)^2}{n^2} + + In :cite:`IBM:2017`, the formula is instead :math:`\frac{(b-c)^2}{n^2}`, + but it is clear from :cite:`Penrose:1952` that this should not be an + assymmetric value with respect two the ordering of the two sets. Meanwhile, + :cite:`Deza:2016` gives a formula that is equivalent to + :math:`\sqrt{n}\cdot(b+c)`. + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Size instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. 
+ metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Size, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist(self, src, tar): + """Return the Penrose's size difference of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Size difference + + Examples + -------- + >>> cmp = Size() + >>> cmp.sim('cat', 'hat') + 0.9999739691795085 + >>> cmp.sim('Niall', 'Neil') + 0.9999202806122449 + >>> cmp.sim('aluminum', 'Catalan') + 0.9996348736257049 + >>> cmp.sim('ATCG', 'TAGC') + 0.9998373073719283 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + return ( + self._symmetric_difference_card() + ) ** 2 / self._population_unique_card() ** 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_smith_waterman.py b/abydos/distance/_smith_waterman.py index 4e5b46e8b..51454419b 100644 --- a/abydos/distance/_smith_waterman.py +++ b/abydos/distance/_smith_waterman.py @@ -28,6 +28,8 @@ unicode_literals, ) +from deprecation import deprecated + from numpy import float32 as np_float32 from numpy import zeros as np_zeros @@ -35,6 +37,7 @@ from ._ident import sim_ident from ._needleman_wunsch import NeedlemanWunsch +from .. import __version__ __all__ = ['SmithWaterman', 'smith_waterman'] @@ -45,22 +48,42 @@ class SmithWaterman(NeedlemanWunsch): The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance measure, differing from Needleman-Wunsch in that it focuses on local alignment and disallows negative scores. + + .. versionadded:: 0.3.6 """ - def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident): - """Return the Smith-Waterman score of two strings. + def __init__(self, gap_cost=1, sim_func=None, **kwargs): + """Initialize SmithWaterman instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison gap_cost : float The cost of an alignment gap (1 by default) sim_func : function A function that returns the similarity of two characters (identity similarity by default) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(SmithWaterman, self).__init__(**kwargs) + self._gap_cost = gap_cost + self._sim_func = sim_func + if self._sim_func is None: + self._sim_func = NeedlemanWunsch.sim_matrix + + def dist_abs(self, src, tar): + """Return the Smith-Waterman score of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -79,22 +102,31 @@ def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident): >>> cmp.dist_abs('ATCG', 'TAGC') 1.0 + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) - for i in range(len(src) + 1): - d_mat[i, 0] = 0 - for j in range(len(tar) + 1): - d_mat[0, j] = 0 for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): - match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1]) - delete = d_mat[i - 1, j] - gap_cost - insert = d_mat[i, j - 1] - gap_cost + match = d_mat[i - 1, j - 1] + self._sim_func( + src[i - 1], tar[j - 1] + ) + delete = d_mat[i - 1, j] - self._gap_cost + insert = d_mat[i, j - 1] - self._gap_cost d_mat[i, j] = max(0, match, delete, insert) return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SmithWaterman.dist_abs method instead.', +) def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident): """Return the Smith-Waterman score of two strings. @@ -128,8 +160,10 @@ def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident): >>> smith_waterman('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ - return SmithWaterman().dist_abs(src, tar, gap_cost, sim_func) + return SmithWaterman(gap_cost, sim_func).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_soft_cosine.py b/abydos/distance/_soft_cosine.py new file mode 100644 index 000000000..d8bd147b2 --- /dev/null +++ b/abydos/distance/_soft_cosine.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._softy_cosine. + +Soft Cosine similarity & distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._levenshtein import Levenshtein +from ._token_distance import _TokenDistance + +__all__ = ['SoftCosine'] + + +class SoftCosine(_TokenDistance): + r"""Soft Cosine similarity. + + As described in :cite:`Sidorov:2014`, soft cosine similarity of two + multi-sets X and Y, drawn from an alphabet S, is + + .. math:: + + sim_{soft cosine}(X, Y) = + \frac{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i Y_j} + {\sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i X_j} + \sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} Y_i Y_j}} + + where :math:`s_{ij}` is the similarity of two tokens, by default a function + of Levenshtein distance: :math:`\frac{1}{1+Levenshtein\_distance(i, j)}`. + + Notes + ----- + This class implements soft cosine similarity, as defined by + :cite:`Sidorov:2014`. An alternative formulation of soft cosine similarity + using soft (multi-)sets is provided by the :class:`Cosine` class using + intersection_type=``soft``, based on the soft intersection + defined in :cite:`Russ:2014`. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, metric=None, sim_method='a', **kwargs): + r"""Initialize SoftCosine instance. 
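To make the s_ij weighting concrete, a small self-contained sketch (illustrative only, not part of the patch; the toy bigram tokens are hypothetical) that mirrors the nested sums of the class docstring using similarity method ``a``:

from collections import Counter

def lev(a, b):
    # Plain Levenshtein distance, used only to form s_ij = 1 / (1 + d).
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(cur[j - 1] + 1, prev[j] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def soft_cosine(x, y):
    def s(i, j):
        # Method 'a' from the docstring: 1 / (1 + distance).
        return 1 / (1 + lev(i, j))

    def dot(p, q):
        return sum(s(i, j) * p[i] * q[j] for i in p for j in q)

    return dot(x, y) / (dot(x, x) ** 0.5 * dot(y, y) ** 0.5)

x = Counter(['ca', 'at'])                    # toy token multisets
y = Counter(['ha', 'at'])
print(soft_cosine(x, y))                     # approximately 0.8125 (= 13/16) for these toy tokens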
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` + package, defaulting to the QGrams tokenizer with q=4 + threshold : float + The minimum similarity for a pair of tokens to contribute to + similarity + metric : _Distance + A distance instance from the abydos.distance package, defaulting + to Levenshtein distance + sim_method : str + Selects the similarity method from the four given in + :cite:`Sidorov:2014`: + + - ``a`` : :math:`\frac{1}{1+d}` + - ``b`` : :math:`1-\frac{d}{m}` + - ``c`` : :math:`\sqrt{1-\frac{d}{m}}` + - ``d`` : :math:`\Big(1-\frac{d}{m}\Big)^2` + + Where :math:`d` is the distance (Levenshtein by default) and + :math:`m` is the maximum length of the two tokens. Option `a` is + default, as suggested by the paper. + **kwargs + Arbitrary keyword arguments + + Raises + ------ + ValueError + sim_method must be one of 'a', 'b', 'c', or 'd' + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(SoftCosine, self).__init__(tokenizer, **kwargs) + self.params['metric'] = metric if metric is not None else Levenshtein() + if sim_method not in 'abcd': + raise ValueError("sim_method must be one of 'a', 'b', 'c', or 'd'") + self.params['sim_method'] = sim_method + + def sim(self, src, tar): + r"""Return the Soft Cosine similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Fuzzy Cosine similarity + + Examples + -------- + >>> cmp = SoftCosine() + >>> cmp.sim('cat', 'hat') + 0.8750000000000001 + >>> cmp.sim('Niall', 'Neil') + 0.8844691709074513 + >>> cmp.sim('aluminum', 'Catalan') + 0.831348688760277 + >>> cmp.sim('ATCG', 'TAGC') + 0.8571428571428572 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 0.0 + + similarity = { + 'a': lambda src, tar: 1 + / (1 + self.params['metric'].dist_abs(src, tar)), + 'b': lambda src, tar: 1 + - ( + self.params['metric'].dist_abs(src, tar) + / max(len(src), len(tar)) + ), + 'c': lambda src, tar: ( + 1 + - ( + self.params['metric'].dist_abs(src, tar) + / max(len(src), len(tar)) + ) + ) + ** 0.5, + 'd': lambda src, tar: ( + 1 + - ( + self.params['metric'].dist_abs(src, tar) + / max(len(src), len(tar)) + ) + ) + ** 2, + } + + nom = 0 + denom_left = 0 + denom_right = 0 + + for src in self._src_tokens.keys(): + for tar in self._tar_tokens.keys(): + nom += ( + self._src_tokens[src] + * self._tar_tokens[tar] + * similarity[self.params['sim_method']](src, tar) + ) + + for src in self._src_tokens.keys(): + for tar in self._src_tokens.keys(): + denom_left += ( + self._src_tokens[src] + * self._src_tokens[tar] + * similarity[self.params['sim_method']](src, tar) + ) + + for src in self._tar_tokens.keys(): + for tar in self._tar_tokens.keys(): + denom_right += ( + self._tar_tokens[src] + * self._tar_tokens[tar] + * similarity[self.params['sim_method']](src, tar) + ) + + return nom / (denom_left ** 0.5 * denom_right ** 0.5) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_softtf_idf.py b/abydos/distance/_softtf_idf.py new file mode 100644 index 000000000..e1eed208d --- /dev/null +++ b/abydos/distance/_softtf_idf.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._softtf_idf. + +SoftTF-IDF similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Counter +from math import log1p + +from ._jaro_winkler import JaroWinkler +from ._token_distance import _TokenDistance +from ..corpus import UnigramCorpus + +__all__ = ['SoftTFIDF'] + + +class SoftTFIDF(_TokenDistance): + r"""SoftTF-IDF similarity. + + For two sets X and Y and a population N, SoftTF-IDF similarity + :cite:`Cohen:2003` is + + .. math:: + + \begin{array}{ll} + sim_{SoftTF-IDF}(X, Y) &= \sum_{w \in \{sim_{metric}(x, y) \ge + \theta | x \in X, y \in Y \}} V(w, S) \cdot V(w, X) \cdot V(w, Y) + \\ + \\ + V(w, S) &= \frac{V'(w, S)}{\sqrt{\sum_{w \in S} V'(w, S)^2}} + \\ + \\ + V'(w, S) &= log(1+TF_{w,S}) \cdot log(1+IDF_w) + \end{array} + + Notes + ----- + One is added to both the TF & IDF values before taking the logarithm to + ensure the logarithms do not fall to 0, which will tend to result in 0.0 + similarities even when there is a degree of matching. + + Rather than needing to exceed the threshold value, as in :cite:`Cohen:2003` + the similarity must be greater than or equal to the threshold. + + .. 
versionadded:: 0.4.0 + + """ + + def __init__( + self, tokenizer=None, corpus=None, metric=None, threshold=0.9, **kwargs + ): + """Initialize SoftTFIDF instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + corpus : UnigramCorpus + A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus will + be created from the two words when a similarity function is called. + metric : _Distance + A string distance measure class for making soft matches, by default + Jaro-Winkler. + threshold : float + A threshold value, similarities above which are counted as + soft matches, by default 0.9. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(SoftTFIDF, self).__init__(tokenizer=tokenizer, **kwargs) + self._corpus = corpus + self._metric = metric + self._threshold = threshold + + if self._metric is None: + self._metric = JaroWinkler() + + def sim(self, src, tar): + """Return the SoftTF-IDF similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + SoftTF-IDF similarity + + Examples + -------- + >>> cmp = SoftTFIDF() + >>> cmp.sim('cat', 'hat') + 0.30404449697373 + >>> cmp.sim('Niall', 'Neil') + 0.20108911303601 + >>> cmp.sim('aluminum', 'Catalan') + 0.05355175631194 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src_tok, tar_tok = self._get_tokens() + + if self._corpus is None: + corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer']) + corpus.add_document(src) + corpus.add_document(tar) + else: + corpus = self._corpus + + matches = {(tok, tok): 1.0 for tok in self._crisp_intersection()} + sims = Counter() + s_toks = set(self._src_only().keys()) + t_toks = set(self._tar_only().keys()) + for s_tok in s_toks: + for t_tok in t_toks: + sim = self._metric.sim(s_tok, t_tok) + if sim > self._threshold: + sims[(s_tok, t_tok)] = sim + for tokens, value in sims.most_common(): + if tokens[0] in s_toks and tokens[1] in t_toks: + matches[tokens] = value + s_toks.remove(tokens[0]) + t_toks.remove(tokens[1]) + + vws_dict = {} + vwt_dict = {} + for token in src_tok.keys(): + vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token) + for token in tar_tok.keys(): + vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token) + + vws_rss = sum(score ** 2 for score in vws_dict.values()) ** 0.5 + vwt_rss = sum(score ** 2 for score in vwt_dict.values()) ** 0.5 + + return float( + round( + sum( + vws_dict[s_tok] + / vws_rss + * vwt_dict[t_tok] + / vwt_rss + * matches[(s_tok, t_tok)] + for s_tok, t_tok in matches.keys() + ), + 14, + ) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_michener.py b/abydos/distance/_sokal_michener.py new file mode 100644 index 000000000..cb78e6aa3 --- /dev/null +++ b/abydos/distance/_sokal_michener.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. 
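A usage sketch for the SoftTFIDF class above (illustrative only, not part of the patch; the corpus argument is shown only as a hypothetical placeholder):

from abydos.distance import SoftTFIDF

cmp = SoftTFIDF()                     # Jaro-Winkler soft matches at threshold 0.9
print(cmp.sim('cat', 'hat'))          # 0.30404449697373, per the doctest above

# With a prebuilt UnigramCorpus, IDF weights come from that corpus rather than
# from just the two strings being compared:
# cmp = SoftTFIDF(corpus=my_unigram_corpus)   # hypothetical corpus instance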
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sokal_michener. + +Sokal & Michener similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalMichener'] + + +class SokalMichener(_TokenDistance): + r"""Sokal & Michener similarity. + + For two sets X and Y and a population N, the Sokal & Michener's + simple matching coefficient :cite:`Sokal:1958`, equivalent to the Rand + index :cite:`Rand:1971` is + + .. math:: + + sim_{SokalMichener}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalMichener} = + \frac{a+d}{n} + + Notes + ----- + The associated distance metric is the mean Manhattan distance and 4 times + the value of the variance dissimilarity of :cite:`IBM:2017`. + + In terms of a confusion matrix, this is equivalent to accuracy + :py:meth:`ConfusionTable.accuracy`. + + .. versionadded:: 0.4.0 + + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize SokalMichener instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(SokalMichener, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Sokal & Michener similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Michener similarity + + Examples + -------- + >>> cmp = SokalMichener() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.9910714285714286 + >>> cmp.sim('aluminum', 'Catalan') + 0.9808917197452229 + >>> cmp.sim('ATCG', 'TAGC') + 0.9872448979591837 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return (a + d) / n + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_sneath_i.py b/abydos/distance/_sokal_sneath_i.py new file mode 100644 index 000000000..8194f5474 --- /dev/null +++ b/abydos/distance/_sokal_sneath_i.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sokal_sneath_i. + +Sokal & Sneath I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalSneathI'] + + +class SokalSneathI(_TokenDistance): + r"""Sokal & Sneath I similarity. + + For two sets X and Y and a population N, Sokal & Sneath I similarity + :cite:`Sokal:1963` is + + .. math:: + + sim_{SokalSneathI}(X, Y) = + \frac{2(|X \cap Y| + |(N \setminus X) \setminus Y|)} + {|X \cap Y| + |(N \setminus X) \setminus Y| + |N|} + + This is the first of five "Unnamed coefficients" presented in + :cite:`Sokal:1963`. It corresponds to the "Matched pairs carry twice the + weight of unmatched pairs in the Denominator" with "Negative Matches in + Numerator Included". + "Negative Matches in Numerator Excluded" corresponds to the Sørensen–Dice + coefficient, :class:`.Dice`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalSneathI} = + \frac{2(a+d)}{a+d+n} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize SokalSneathI instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(SokalSneathI, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Sokal & Sneath I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Sneath I similarity + + Examples + -------- + >>> cmp = SokalSneathI() + >>> cmp.sim('cat', 'hat') + 0.9974424552429667 + >>> cmp.sim('Niall', 'Neil') + 0.9955156950672646 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903536977491961 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return (2 * (a + d)) / (a + d + n) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_sneath_ii.py b/abydos/distance/_sokal_sneath_ii.py new file mode 100644 index 000000000..5dc2f3758 --- /dev/null +++ b/abydos/distance/_sokal_sneath_ii.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sokal_sneath_ii. + +Sokal & Sneath II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalSneathII'] + + +class SokalSneathII(_TokenDistance): + r"""Sokal & Sneath II similarity. + + For two sets X and Y, Sokal & Sneath II similarity :cite:`Sokal:1963` is + + .. math:: + + sim_{SokalSneathII}(X, Y) = + \frac{|X \cap Y|} + {|X \cap Y| + 2|X \triangle Y|} + + This is the second of five "Unnamed coefficients" presented in + :cite:`Sokal:1963`. It corresponds to the "Unmatched pairs carry twice the + weight of matched pairs in the Denominator" with "Negative Matches in + Numerator Excluded". + "Negative Matches in Numerator Included" corresponds to the Rogers & + Tanimoto similarity, :class:`.RogersTanimoto`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalSneathII} = + \frac{a}{a+2(b+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize SokalSneathII instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(SokalSneathII, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Sokal & Sneath II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Sneath II similarity + + Examples + -------- + >>> cmp = SokalSneathII() + >>> cmp.sim('cat', 'hat') + 0.2 + >>> cmp.sim('Niall', 'Neil') + 0.125 + >>> cmp.sim('aluminum', 'Catalan') + 0.03225806451612903 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return a / (a + 2 * (b + c)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_sneath_iii.py b/abydos/distance/_sokal_sneath_iii.py new file mode 100644 index 000000000..ade297ddf --- /dev/null +++ b/abydos/distance/_sokal_sneath_iii.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sokal_sneath_iii. + +Sokal & Sneath III similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalSneathIII'] + + +class SokalSneathIII(_TokenDistance): + r"""Sokal & Sneath III similarity. + + For two sets X and Y and a population N, Sokal & Sneath III similarity + :cite:`Sokal:1963` is + + .. math:: + + sim_{SokalSneathIII}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|} + {|X \triangle Y|} + + This is the third of five "Unnamed coefficients" presented in + :cite:`Sokal:1963`. It corresponds to the "Unmatched pairs only in the + Denominator" with "Negative Matches in Numerator Excluded". + "Negative Matches in Numerator Included" corresponds to the Kulczynski I + coefficient, :class:`.KulczynskiI`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalSneathIII} = + \frac{a+d}{b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize SokalSneathIII instance. 
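For the 2x2 quantities these Sokal & Sneath variants share, a quick consistency check (illustrative only, not part of the patch; it assumes the default tokenizer yields padded bigrams, e.g. 'cat' -> {'$c', 'ca', 'at', 't#'}):

# 'cat' vs. 'hat' as (assumed) padded bigrams
src = {'$c', 'ca', 'at', 't#'}
tar = {'$h', 'ha', 'at', 't#'}

a = len(src & tar)                    # 2 bigrams in both
b = len(src - tar)                    # 2 bigrams only in src
c = len(tar - src)                    # 2 bigrams only in tar

# Sokal & Sneath II: a / (a + 2(b + c)) = 2 / 10, matching the doctest value
assert a / (a + 2 * (b + c)) == 0.2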
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(SokalSneathIII, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim_score(self, src, tar): + """Return the Sokal & Sneath III similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Sneath III similarity + + Examples + -------- + >>> cmp = SokalSneathIII() + >>> cmp.sim_score('cat', 'hat') + 195.0 + >>> cmp.sim_score('Niall', 'Neil') + 111.0 + >>> cmp.sim_score('aluminum', 'Catalan') + 51.333333333333336 + >>> cmp.sim_score('ATCG', 'TAGC') + 77.4 + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return float('inf') + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + return (a + d) / (b + c) + + def sim(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Sokal & Sneath III similarity. + + + .. versionadded:: 0.3.6 + + """ + raise NotImplementedError( + 'Method disabled for Sokal & Sneath III similarity.' + ) + + def dist(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Sokal & Sneath III similarity. + + + .. versionadded:: 0.3.6 + + """ + raise NotImplementedError( + 'Method disabled for Sokal & Sneath III similarity.' + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_sneath_iv.py b/abydos/distance/_sokal_sneath_iv.py new file mode 100644 index 000000000..174f0f416 --- /dev/null +++ b/abydos/distance/_sokal_sneath_iv.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
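Because (a+d)/(b+c) is unbounded, SokalSneathIII exposes only sim_score and disables the normalized sim/dist methods. A short sketch (illustrative only, not part of the patch) of how that surfaces to callers:

from abydos.distance import SokalSneathIII

cmp = SokalSneathIII()
print(cmp.sim_score('cat', 'hat'))    # 195.0, per the doctest above
print(cmp.sim_score('cat', 'cat'))    # inf, since identical strings have b + c == 0

try:
    cmp.sim('cat', 'hat')             # normalized methods are disabled
except NotImplementedError as err:
    print(err)                        # Method disabled for Sokal & Sneath III similarity.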
+ +"""abydos.distance._sokal_sneath_iv. + +Sokal & Sneath IV similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalSneathIV'] + + +class SokalSneathIV(_TokenDistance): + r"""Sokal & Sneath IV similarity. + + For two sets X and Y and a population N, Sokal & Sneath IV similarity + :cite:`Sokal:1963` is + + .. math:: + + sim_{SokalSneathIV}(X, Y) = + \frac{1}{4}\Bigg( + \frac{|X \cap Y|}{|X|}+ + \frac{|X \cap Y|}{|Y|}+ + \frac{|(N \setminus X) \setminus Y|} + {|N \setminus Y|}+ + \frac{|(N \setminus X) \setminus Y|} + {|N \setminus X|} + \Bigg) + + This is the fourth of five "Unnamed coefficients" presented in + :cite:`Sokal:1963`. It corresponds to the first "Marginal totals in the + Denominator" with "Negative Matches in Numerator Included". + "Negative Matches in Numerator Excluded" corresponds to the Kulczynski II + similarity, :class:`.KulczynskiII`. This is also Rogot & Goldberg's + "adjusted agreement" :math:`A_1` :cite:`Rogot:1966`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalSneathIV} = + \frac{1}{4}\Big(\frac{a}{a+b}+\frac{a}{a+c}+ + \frac{d}{b+d}+\frac{d}{c+d}\Big) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize SokalSneathIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(SokalSneathIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Sokal & Sneath IV similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Sneath IV similarity + + Examples + -------- + >>> cmp = SokalSneathIV() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6810856260030602 + >>> cmp.sim('aluminum', 'Catalan') + 0.5541986205645999 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + a_part = 0 if a == 0 else (a / (a + b) + a / (a + c)) + d_part = 0 if d == 0 else (d / (b + d) + d / (c + d)) + + return 0.25 * (a_part + d_part) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sokal_sneath_v.py b/abydos/distance/_sokal_sneath_v.py new file mode 100644 index 000000000..47b3d75cf --- /dev/null +++ b/abydos/distance/_sokal_sneath_v.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sokal_sneath_v. + +Sokal & Sneath V similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['SokalSneathV'] + + +class SokalSneathV(_TokenDistance): + r"""Sokal & Sneath V similarity. + + For two sets X and Y and a population N, Sokal & Sneath V similarity + :cite:`Sokal:1963` is + + .. math:: + + sim_{SokalSneathV}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + {\sqrt{|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|}} + + This is the fifth of five "Unnamed coefficients" presented in + :cite:`Sokal:1963`. It corresponds to the second "Marginal totals in the + Denominator" with "Negative Matches in Numerator Included", also sometimes + referred to as Ochiai II similarity. + "Negative Matches in Numerator Excluded" corresponds to the Cosine + similarity, :class:`.Cosine`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{SokalSneathV} = + \frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize SokalSneathV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. 
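The doctest values for SokalSneathIV follow directly from the 2x2 counts. A worked check (illustrative only, not part of the patch; it assumes the default padded-bigram tokens and a default alphabet population of 28**2 = 784, which is consistent with the Sokal & Michener doctests above):

# 'cat' vs. 'hat': a = 2 shared bigrams, b = c = 2 unshared, d = 784 - 6 = 778
a, b, c, d = 2, 2, 2, 778

sim = 0.25 * (a / (a + b) + a / (a + c) + d / (b + d) + d / (c + d))
print(sim)                            # approximately 0.7487179487, matching the doctest above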
+ + + .. versionadded:: 0.4.0 + + """ + super(SokalSneathV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Sokal & Sneath V similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sokal & Sneath V similarity + + Examples + -------- + >>> cmp = SokalSneathV() + >>> cmp.sim('cat', 'hat') + 0.4987179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.3635068033537323 + >>> cmp.sim('aluminum', 'Catalan') + 0.11671286273067434 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d + if num: + return num / ((a + b) * (a + c) * (b + d) * (c + d)) ** 0.5 + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_sorgenfrei.py b/abydos/distance/_sorgenfrei.py new file mode 100644 index 000000000..82cd139ba --- /dev/null +++ b/abydos/distance/_sorgenfrei.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._sorgenfrei. + +Sorgenfrei similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Sorgenfrei'] + + +class Sorgenfrei(_TokenDistance): + r"""Sorgenfrei similarity. + + For two sets X and Y, Sorgenfrei similarity :cite:`Sorgenfrei:1958` is + + .. math:: + + sim_{Sorgenfrei}(X, Y) = + \frac{|X \cap Y|^2}{|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Sorgenfrei} = + \frac{a^2}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Sorgenfrei instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. 
Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Sorgenfrei, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Sorgenfrei similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Sorgenfrei similarity + + Examples + -------- + >>> cmp = Sorgenfrei() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.13333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.013888888888888888 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + + return a ** 2 / (apb * apc) if a else 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_steffensen.py b/abydos/distance/_steffensen.py new file mode 100644 index 000000000..bd332383a --- /dev/null +++ b/abydos/distance/_steffensen.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._steffensen. + +Steffensen similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from numpy import array as np_array + +from ._token_distance import _TokenDistance + +__all__ = ['Steffensen'] + + +class Steffensen(_TokenDistance): + r"""Steffensen similarity. + + For two sets X and Y and a population N, Steffensen similarity + :math:`\psi^2` :cite:`Steffensen:1934` is + + .. 
math:: + + \begin{array}{ll} + sim_{Steffensen_{\psi}}(X, Y) = \psi^2 &= + \sum_{i \in X}\sum_{j \in Y} p_{ij} \phi_{ij}^2 + \\ + \phi_{ij}^2 &= \frac{(p_{ij} - p_{i*}p_{*i})^2} + {p_{i*}(1-p_{i*})p_{*j}(1-p_{*j})} + \end{array} + + Where each value :math:`p_{ij}` is drawn from the 2x2 contingency table: + + +----------------+------------------+-------------------+---------+ + | | |s_in| ``tar`` | |s_notin| ``tar`` | | + +----------------+------------------+-------------------+---------+ + | |s_in| ``src`` | |s_a| | |s_b| | |s_a+b| | + +----------------+------------------+-------------------+---------+ + | |s_notin| ``src``| |s_c| | |s_d| | |s_c+d| | + +----------------+------------------+-------------------+---------+ + | | |s_a+c| | |s_b+d| | |s_n| | + +----------------+------------------+-------------------+---------+ + + .. |s_in| replace:: :math:`x \in` + .. |s_notin| replace:: :math:`x \notin` + + .. |s_a| replace:: :math:`p_{11} = a` + .. |s_b| replace:: :math:`p_{10} = b` + .. |s_c| replace:: :math:`p_{01} = c` + .. |s_d| replace:: :math:`p_{00} = d` + .. |s_n| replace:: :math:`1` + .. |s_a+b| replace:: :math:`p_{1*} = a+b` + .. |s_a+c| replace:: :math:`p_{*1} = a+c` + .. |s_c+d| replace:: :math:`p_{0*} = c+d` + .. |s_b+d| replace:: :math:`p_{*0} = b+d` + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + normalizer='proportional', + **kwargs + ): + """Initialize Steffensen instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + normalizer : str + Specifies the normalization type. See :ref:`normalizer ` + description in :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Steffensen, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + normalizer=normalizer, + **kwargs + ) + + def sim(self, src, tar): + """Return the Steffensen similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Steffensen similarity + + Examples + -------- + >>> cmp = Steffensen() + >>> cmp.sim('cat', 'hat') + 0.24744247205786737 + >>> cmp.sim('Niall', 'Neil') + 0.1300991207720166 + >>> cmp.sim('aluminum', 'Catalan') + 0.011710186806836031 + >>> cmp.sim('ATCG', 'TAGC') + 4.1196952743871653e-05 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = a + b + c + d + + p = np_array([[a, b], [c, d]]) / n + + psisq = 0.0 + + for i in range(len(p)): + pi_star = p[i, :].sum() + for j in range(len(p[i])): + pj_star = p[:, j].sum() + num = p[i, j] * (p[i, j] - pi_star * pj_star) ** 2 + if num: + psisq += num / ( + pi_star * (1 - pi_star) * pj_star * (1 - pj_star) + ) + + return psisq + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_stiles.py b/abydos/distance/_stiles.py new file mode 100644 index 000000000..ce6a9e7ab --- /dev/null +++ b/abydos/distance/_stiles.py @@ -0,0 +1,236 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._stiles. + +Stiles similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import copysign, log10 + +from ._token_distance import _TokenDistance + +__all__ = ['Stiles'] + + +class Stiles(_TokenDistance): + r"""Stiles similarity. + + For two sets X and Y and a population N, Stiles similarity + :cite:`Stiles:1961` is + + .. math:: + + sim_{Stiles}(X, Y) = log_{10} + \frac{|N| \Big(||X \cap Y| \cdot + |N| - + |X \setminus Y| \cdot |Y \setminus X|| - + \frac{|N|}{2}\Big)^2} + {|X \setminus Y| \cdot |Y \setminus X| \cdot + (|N| - |X \setminus Y|) \cdot + (|N| - |Y \setminus X|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Stiles} = + log_{10} \frac{n(|an-bc|-\frac{1}{2}n)^2}{bc(n-b)(n-c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Stiles instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(Stiles, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Stiles similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Stiles similarity + + Examples + -------- + >>> cmp = Stiles() + >>> cmp.sim_score('cat', 'hat') + 2.6436977886009236 + >>> cmp.sim_score('Niall', 'Neil') + 2.1622951406967723 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.41925115106844024 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.8426334527850912 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + eps = 0.0000001 + a = max(self._intersection_card(), eps) + b = max(self._src_only_card(), eps) + c = max(self._tar_only_card(), eps) + n = max(self._total_complement_card(), eps) + a + b + c + + anmbc = a * n - b * c + + return copysign( + log10( + n + * max((abs(anmbc) - n / 2) ** 2, eps) + / (b * (n - b) * c * (n - c)) + ), + anmbc, + ) + + def corr(self, src, tar): + """Return the Stiles correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Stiles correlation + + Examples + -------- + >>> cmp = Stiles() + >>> cmp.corr('cat', 'hat') + 0.14701542182970487 + >>> cmp.corr('Niall', 'Neil') + 0.11767566062554877 + >>> cmp.corr('aluminum', 'Catalan') + 0.022355640924908403 + >>> cmp.corr('ATCG', 'TAGC') + -0.046296656196428934 + + + .. versionadded:: 0.4.0 + + """ + return self.sim_score(src, tar) / max( + self.sim_score(src, src), self.sim_score(tar, tar) + ) + + def sim(self, src, tar): + """Return the normalized Stiles similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Stiles similarity + + Examples + -------- + >>> cmp = Stiles() + >>> cmp.sim('cat', 'hat') + 0.5735077109148524 + >>> cmp.sim('Niall', 'Neil') + 0.5588378303127743 + >>> cmp.sim('aluminum', 'Catalan') + 0.5111778204624542 + >>> cmp.sim('ATCG', 'TAGC') + 0.4768516719017855 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_strcmp95.py b/abydos/distance/_strcmp95.py index ed5013cfd..8ee3f2152 100644 --- a/abydos/distance/_strcmp95.py +++ b/abydos/distance/_strcmp95.py @@ -30,9 +30,12 @@ from collections import defaultdict +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95'] @@ -50,6 +53,8 @@ class Strcmp95(_Distance): for some common typos and frequently confused characters. It is also limited to uppercase ASCII characters, so it is appropriate to American names, but not much else. + + .. versionadded:: 0.3.6 """ _sp_mx = ( @@ -91,21 +96,36 @@ class Strcmp95(_Distance): ('G', 'J'), ) - def sim(self, src, tar, long_strings=False): - """Return the strcmp95 similarity of two strings. + def __init__(self, long_strings=False, **kwargs): + """Initialize Strcmp95 instance. 
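# Illustrative sketch (editorial aside, not part of the committed files):
# this hunk moves the long_strings flag from Strcmp95.sim() into the
# constructor, so the option is configured once per instance and sim()/dist()
# keep the uniform two-argument signature used across the package.

from abydos.distance import Strcmp95

cmp = Strcmp95(long_strings=True)   # was: Strcmp95().sim(src, tar, True)
print(cmp.sim('Niall', 'Neil'))     # same value the old call form returned
print(cmp.dist('Niall', 'Neil'))    # 1 - sim, via the _Distance base class

# The deprecated sim_strcmp95()/dist_strcmp95() wrappers below preserve the
# old functional signature until 0.6.0.
# --- end of editorial aside ---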
Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison long_strings : bool Set to True to increase the probability of a match when the number of matched characters is large. This option allows for a little more tolerance when the strings are large. It is not an appropriate test when comparing fixed length fields such as phone and social security numbers. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Strcmp95, self).__init__(**kwargs) + self._long_strings = long_strings + + def sim(self, src, tar): + """Return the strcmp95 similarity of two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -124,6 +144,11 @@ def sim(self, src, tar, long_strings=False): >>> cmp.sim('ATCG', 'TAGC') 0.8333333333333334 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _in_range(char): @@ -139,6 +164,8 @@ def _in_range(char): bool True if char is in the range (0, 91) + .. versionadded:: 0.1.0 + """ return 91 > ord(char) > 0 @@ -241,7 +268,7 @@ def _in_range(char): # After agreeing beginning chars, at least two more must agree and # the agreeing characters must be > .5 of remaining characters. if ( - long_strings + self._long_strings and (minv > 4) and (num_com > i + 1) and (2 * num_com >= minv + i) @@ -254,6 +281,12 @@ def _in_range(char): return weight +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Strcmp95.sim method instead.', +) def sim_strcmp95(src, tar, long_strings=False): """Return the strcmp95 similarity of two strings. @@ -288,10 +321,18 @@ def sim_strcmp95(src, tar, long_strings=False): >>> sim_strcmp95('ATCG', 'TAGC') 0.8333333333333334 + .. versionadded:: 0.1.0 + """ - return Strcmp95().sim(src, tar, long_strings) + return Strcmp95(long_strings).sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Strcmp95.dist method instead.', +) def dist_strcmp95(src, tar, long_strings=False): """Return the strcmp95 distance between two strings. @@ -326,8 +367,10 @@ def dist_strcmp95(src, tar, long_strings=False): >>> round(dist_strcmp95('ATCG', 'TAGC'), 12) 0.166666666667 + .. versionadded:: 0.1.0 + """ - return Strcmp95().dist(src, tar, long_strings) + return Strcmp95(long_strings).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_stuart_tau.py b/abydos/distance/_stuart_tau.py new file mode 100644 index 000000000..6c4074165 --- /dev/null +++ b/abydos/distance/_stuart_tau.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._stuart_tau. 
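# Illustrative sketch (editorial aside, not part of the committed files):
# the @deprecated wrappers above (and the matching ones added for Suffix and
# Synoname later in this diff) keep the 0.3.x functional API working until
# 0.6.0. Each wrapper instantiates the class and forwards, warning of its
# deprecation along the way, so both call forms agree:

from abydos.distance import Strcmp95, sim_strcmp95

assert sim_strcmp95('cat', 'hat') == Strcmp95().sim('cat', 'hat')
# --- end of editorial aside ---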
+ +Stuart's Tau correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['StuartTau'] + + +class StuartTau(_TokenDistance): + r"""Stuart's Tau correlation. + + For two sets X and Y and a population N, Stuart's Tau-C correlation + :cite:`Stuart:1953` is + + .. math:: + + corr_{Stuart_{\tau_c}}(X, Y) = + \frac{4 \cdot (|X \cap Y| + |(N \setminus X) \setminus Y| - + |X \triangle Y|)}{|N|^2} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Stuart_{\tau_c}} = + \frac{4 \cdot ((a+d)-(b+c))}{n^2} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize StuartTau instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(StuartTau, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Stuart's Tau correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Stuart's Tau correlation + + Examples + -------- + >>> cmp = StuartTau() + >>> cmp.corr('cat', 'hat') + 0.005049979175343606 + >>> cmp.corr('Niall', 'Neil') + 0.005010932944606414 + >>> cmp.corr('aluminum', 'Catalan') + 0.004900807334983164 + >>> cmp.corr('ATCG', 'TAGC') + 0.0049718867138692216 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + if not n: + return 1.0 + return max(-1.0, min(1.0, 4 * (a + d - b - c) / (n ** 2))) + + def sim(self, src, tar): + """Return the Stuart's Tau similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Stuart's Tau similarity + + Examples + -------- + >>> cmp = StuartTau() + >>> cmp.sim('cat', 'hat') + 0.5025249895876718 + >>> cmp.sim('Niall', 'Neil') + 0.5025054664723032 + >>> cmp.sim('aluminum', 'Catalan') + 0.5024504036674916 + >>> cmp.sim('ATCG', 'TAGC') + 0.5024859433569346 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_suffix.py b/abydos/distance/_suffix.py index 2880a5423..9b9b7bbbd 100644 --- a/abydos/distance/_suffix.py +++ b/abydos/distance/_suffix.py @@ -28,15 +28,21 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._distance import _Distance +from .. import __version__ __all__ = ['Suffix', 'dist_suffix', 'sim_suffix'] class Suffix(_Distance): - """Suffix similarity and distance.""" + """Suffix similarity and distance. + + .. versionadded:: 0.3.6 + """ def sim(self, src, tar): """Return the suffix similarity of two strings. @@ -69,6 +75,11 @@ def sim(self, src, tar): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 1.0 @@ -82,6 +93,12 @@ def sim(self, src, tar): return 0.0 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Suffix.sim method instead.', +) def sim_suffix(src, tar): """Return the suffix similarity of two strings. @@ -110,10 +127,18 @@ def sim_suffix(src, tar): >>> sim_suffix('ATCG', 'TAGC') 0.0 + .. versionadded:: 0.1.0 + """ return Suffix().sim(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Suffix.dist method instead.', +) def dist_suffix(src, tar): """Return the suffix distance between two strings. @@ -142,6 +167,8 @@ def dist_suffix(src, tar): >>> dist_suffix('ATCG', 'TAGC') 1.0 + .. versionadded:: 0.1.0 + """ return Suffix().dist(src, tar) diff --git a/abydos/distance/_synoname.py b/abydos/distance/_synoname.py index 73176ac60..037e44b06 100644 --- a/abydos/distance/_synoname.py +++ b/abydos/distance/_synoname.py @@ -30,12 +30,15 @@ from collections import Iterable +from deprecation import deprecated + from ._distance import _Distance from ._levenshtein import levenshtein from ._ratcliff_obershelp import sim_ratcliff_obershelp +from .. import __version__ # noinspection PyProtectedMember -from ..fingerprint._synoname import SynonameToolcode +from ..fingerprint._synoname_toolcode import SynonameToolcode __all__ = ['Synoname', 'synoname'] @@ -44,6 +47,8 @@ class Synoname(_Distance): """Synoname. Cf. :cite:`Getty:1991,Gross:1991` + + .. versionadded:: 0.3.6 """ _stc = SynonameToolcode() @@ -104,6 +109,11 @@ def _synoname_strip_punct(self, word): >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ') 'ABCD EFGHIJ' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ stripped = '' for char in word: @@ -142,6 +152,11 @@ def _synoname_word_approximation( ... 'Tom Joe Bob', 'Tom Joe') 0.6 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if features is None: features = {} @@ -422,23 +437,18 @@ def _synoname_word_approximation( return 0 - def dist_abs( + def __init__( self, - src, - tar, word_approx_min=0.3, char_approx_min=0.73, tests=2 ** 12 - 1, ret_name=False, + **kwargs ): - """Return the Synoname similarity type of two words. + """Initialize Synoname instance. 
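# Illustrative sketch (editorial aside, not part of the committed files):
# as with Strcmp95, Synoname's per-call options (word_approx_min,
# char_approx_min, tests, ret_name) move into the constructor; the revised
# doctests below show the new pattern.

from abydos.distance import Synoname

cmp = Synoname(ret_name=True)
cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
# -> 'omission' (per the doctest below); dist() always stays numeric because
# it calls dist_abs() with force_numeric=True before dividing by 14.
# --- end of editorial aside ---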
Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison word_approx_min : float The minimum word approximation value to signal a 'word_approx' match @@ -450,6 +460,37 @@ def dist_abs( names to perform (defaults to performing all tests) ret_name : bool If True, returns the match name rather than its integer equivalent + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Synoname, self).__init__(**kwargs) + self._word_approx_min = word_approx_min + self._char_approx_min = char_approx_min + self._ret_name = ret_name + + self._tests = tests + if isinstance(self._tests, Iterable): + new_tests = 0 + for term in self._tests: + if term in self._test_dict: + new_tests += self._test_dict[term] + self._tests = new_tests + + def dist_abs(self, src, tar, force_numeric=False): + """Return the Synoname similarity type of two words. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + force_numeric : bool + Overrides the instance's ret_name setting Returns ------- @@ -461,24 +502,22 @@ def dist_abs( >>> cmp = Synoname() >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', '')) 2 - >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''), - ... ret_name=True) + + >>> cmp = Synoname(ret_name=True) + >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', '')) 'omission' >>> cmp.dist_abs(('Dore', 'Gustave', ''), - ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True) + ... ('Dore', 'Paul Gustave Louis Christophe', '')) 'inclusion' - >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''), - ... ret_name=True) + >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', '')) 'word_approx' - """ - if isinstance(tests, Iterable): - new_tests = 0 - for term in tests: - if term in self._test_dict: - new_tests += self._test_dict[term] - tests = new_tests + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + + """ if isinstance(src, tuple): src_ln, src_fn, src_qual = src elif '#' in src: @@ -501,7 +540,7 @@ def _split_special(spec): return spec_list def _fmt_retval(val): - if ret_name: + if self._ret_name and not force_numeric: return self._match_name[val] return val @@ -578,13 +617,13 @@ def _approx_c(): full_tar = full_tar[len(intro) :] loc_ratio = sim_ratcliff_obershelp(full_src, full_tar) - return loc_ratio >= char_approx_min, loc_ratio + return loc_ratio >= self._char_approx_min, loc_ratio approx_c_result, ca_ratio = _approx_c() - if tests & self._test_dict['exact'] and fn_equal and ln_equal: + if self._tests & self._test_dict['exact'] and fn_equal and ln_equal: return _fmt_retval(self._match_type_dict['exact']) - if tests & self._test_dict['omission']: + if self._tests & self._test_dict['omission']: if ( fn_equal and levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1 @@ -596,7 +635,7 @@ def _approx_c(): and levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1 ): return _fmt_retval(self._match_type_dict['omission']) - if tests & self._test_dict['substitution']: + if self._tests & self._test_dict['substitution']: if ( fn_equal and levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1 @@ -607,7 +646,7 @@ def _approx_c(): and levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1 ): return _fmt_retval(self._match_type_dict['substitution']) - if tests & self._test_dict['transposition']: + if self._tests & self._test_dict['transposition']: if fn_equal and ( levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1)) == 1 @@ -618,7 +657,7 @@ def _approx_c(): == 1 ): return _fmt_retval(self._match_type_dict['transposition']) - if tests & self._test_dict['punctuation']: + if self._tests & self._test_dict['punctuation']: np_src_fn = self._synoname_strip_punct(src_fn) np_tar_fn = self._synoname_strip_punct(tar_fn) np_src_ln = self._synoname_strip_punct(src_ln) @@ -635,7 +674,7 @@ def _approx_c(): if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln): return _fmt_retval(self._match_type_dict['punctuation']) - if tests & self._test_dict['initials'] and ln_equal: + if self._tests & self._test_dict['initials'] and ln_equal: if src_fn and tar_fn: src_initials = self._synoname_strip_punct(src_fn).split() tar_initials = self._synoname_strip_punct(tar_fn).split() @@ -668,8 +707,8 @@ def _approx_c(): ) ): return _fmt_retval(self._match_type_dict['initials']) - if tests & self._test_dict['extension']: - if src_ln[1] == tar_ln[1] and ( + if self._tests & self._test_dict['extension']: + if src_ln[1:2] == tar_ln[1:2] and ( src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln) ): if ( @@ -678,13 +717,13 @@ def _approx_c(): or (src_fn and tar_fn.startswith(src_fn)) ) and not roman_conflict: return _fmt_retval(self._match_type_dict['extension']) - if tests & self._test_dict['inclusion'] and ln_equal: + if self._tests & self._test_dict['inclusion'] and ln_equal: if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_ln): return _fmt_retval(self._match_type_dict['inclusion']) - if tests & self._test_dict['no_first'] and ln_equal: + if self._tests & self._test_dict['no_first'] and ln_equal: if src_fn == '' or tar_fn == '': return _fmt_retval(self._match_type_dict['no_first']) - if tests & self._test_dict['word_approx']: + if self._tests & self._test_dict['word_approx']: ratio = self._synoname_word_approximation( src_ln, tar_ln, @@ -697,27 +736,20 @@ def _approx_c(): 'tar_specials': tar_specials, }, ) - if ratio == 1 and tests & 
self._test_dict['confusions']: + if ratio == 1 and self._tests & self._test_dict['confusions']: if ( ' '.join((src_fn, src_ln)).strip() == ' '.join((tar_fn, tar_ln)).strip() ): return _fmt_retval(self._match_type_dict['confusions']) - if ratio >= word_approx_min: + if ratio >= self._word_approx_min: return _fmt_retval(self._match_type_dict['word_approx']) - if tests & self._test_dict['char_approx']: - if ca_ratio >= char_approx_min: + if self._tests & self._test_dict['char_approx']: + if ca_ratio >= self._char_approx_min: return _fmt_retval(self._match_type_dict['char_approx']) return _fmt_retval(self._match_type_dict['no_match']) - def dist( - self, - src, - tar, - word_approx_min=0.3, - char_approx_min=0.73, - tests=2 ** 12 - 1, - ): + def dist(self, src, tar): """Return the normalized Synoname distance between two words. Parameters @@ -726,28 +758,27 @@ def dist( Source string for comparison tar : str Target string for comparison - word_approx_min : float - The minimum word approximation value to signal a 'word_approx' - match - char_approx_min : float - The minimum character approximation value to signal a 'char_approx' - match - tests : int or Iterable - Either an integer indicating tests to perform or a list of test - names to perform (defaults to performing all tests) Returns ------- float Normalized Synoname distance + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return ( - synoname(src, tar, word_approx_min, char_approx_min, tests, False) - / 14 - ) + return self.dist_abs(src, tar, force_numeric=True) / 14 +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Synoname.dist_abs method instead.', +) def synoname( src, tar, @@ -796,10 +827,12 @@ def synoname( ... ret_name=True) 'word_approx' + .. versionadded:: 0.3.0 + """ - return Synoname().dist_abs( - src, tar, word_approx_min, char_approx_min, tests, ret_name - ) + return Synoname( + word_approx_min, char_approx_min, tests, ret_name + ).dist_abs(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_tarantula.py b/abydos/distance/_tarantula.py new file mode 100644 index 000000000..e9a790bd9 --- /dev/null +++ b/abydos/distance/_tarantula.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tarantula. + +Tarantula similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Tarantula'] + + +class Tarantula(_TokenDistance): + r"""Tarantula similarity. + + For two sets X and Y and a population N, Tarantula similarity + :cite:`Jones:2005` is + + .. 
math:: + + sim_{Tarantula}(X, Y) = + \frac{\frac{|X \cap Y|}{|X \cap Y| + |X \setminus Y|}} + {\frac{|X \cap Y|}{|X \cap Y| + |X \setminus Y|} + + \frac{|Y \setminus X|} + {|Y \setminus X| + |(N \setminus X) \setminus Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Tarantula} = + \frac{\frac{a}{a+b}}{\frac{a}{a+b} + \frac{c}{c+d}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Tarantula instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Tarantula, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Tarantula similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tarantula similarity + + Examples + -------- + >>> cmp = Tarantula() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.98856416772554 + >>> cmp.sim('aluminum', 'Catalan') + 0.9249106078665077 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * (c + d) + if num: + return num / (a * (2 * c + d) + b * c) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tarwid.py b/abydos/distance/_tarwid.py new file mode 100644 index 000000000..61426e632 --- /dev/null +++ b/abydos/distance/_tarwid.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tarwid. 
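# Illustrative sketch (editorial aside, not part of the committed files):
# worked check for the Tarantula measure defined just above, using the same
# default-tokenizer counts as before for 'cat' vs. 'hat' (a=2, b=2, c=2,
# d = 28**2 - 6 = 778):

def tarantula(a, b, c, d):
    # sim = (a/(a+b)) / (a/(a+b) + c/(c+d))
    src_rate = a / (a + b)
    tar_rate = c / (c + d)
    return src_rate / (src_rate + tar_rate)

# Reproduces the sim('cat', 'hat') doctest value above.
assert abs(tarantula(2, 2, 2, 778) - 0.9948979591836735) < 1e-12
# --- end of editorial aside ---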
+ +Tarwid correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Tarwid'] + + +class Tarwid(_TokenDistance): + r"""Tarwid correlation. + + For two sets X and Y and a population N, the Tarwid correlation + :cite:`Tarwid:1960` is + + .. math:: + + corr_{Tarwid}(X, Y) = + \frac{|N| \cdot |X \cap Y| - |X| \cdot |Y|} + {|N| \cdot |X \cap Y| + |X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Tarwid} = + \frac{na-(a+b)(a+c)}{na+(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Tarwid instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Tarwid, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Tarwid correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tarwid correlation + + Examples + -------- + >>> cmp = Tarwid() + >>> cmp.corr('cat', 'hat') + 0.9797979797979798 + >>> cmp.corr('Niall', 'Neil') + 0.9624530663329162 + >>> cmp.corr('aluminum', 'Catalan') + 0.8319719953325554 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + nta = self._population_unique_card() * self._intersection_card() + abtac = self._src_card() * self._tar_card() + + if nta == abtac: + return 0.0 + return (nta - abtac) / (nta + abtac) + + def sim(self, src, tar): + """Return the Tarwid similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tarwid similarity + + Examples + -------- + >>> cmp = Tarwid() + >>> cmp.sim('cat', 'hat') + 0.9898989898989898 + >>> cmp.sim('Niall', 'Neil') + 0.981226533166458 + >>> cmp.sim('aluminum', 'Catalan') + 0.9159859976662776 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tetrachoric.py b/abydos/distance/_tetrachoric.py new file mode 100644 index 000000000..a450c7339 --- /dev/null +++ b/abydos/distance/_tetrachoric.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tetrachoric. + +Tetrachoric correlation coefficient +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import cos, pi + +from ._token_distance import _TokenDistance + +__all__ = ['Tetrachoric'] + + +class Tetrachoric(_TokenDistance): + r"""Tetrachoric correlation coefficient. + + For two sets X and Y and a population N, the Tetrachoric correlation + coefficient :cite:`Pearson:1900` is + + .. math:: + + corr_{Tetrachoric}(X, Y) = \cos \Big(\frac{\pi + \sqrt{|X \setminus Y| \cdot |Y \setminus X|}} + {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + \sqrt{|X \setminus Y| \cdot |Y \setminus X|}}\Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Tetrachoric} = + \cos \frac{\pi\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Tetrachoric instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Tetrachoric, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Tetrachoric correlation coefficient of two strings. 
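# Illustrative sketch (editorial aside, not part of the committed files):
# the cosine form above can be checked directly from the default-tokenizer
# counts for 'cat' vs. 'hat' (a=2, b=2, c=2, d=778):

from math import cos, pi

def tetrachoric_corr(a, b, c, d):
    rad = (a * d) ** 0.5  # sqrt(ad)
    rbc = (b * c) ** 0.5  # sqrt(bc)
    return cos(pi * rbc / (rad + rbc)) if rbc else 1.0

# Reproduces the corr('cat', 'hat') doctest value below.
assert abs(tetrachoric_corr(2, 2, 2, 778) - 0.9885309061036239) < 1e-12
# --- end of editorial aside ---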
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tetrachoric correlation coefficient + + Examples + -------- + >>> cmp = Tetrachoric() + >>> cmp.corr('cat', 'hat') + 0.9885309061036239 + >>> cmp.corr('Niall', 'Neil') + 0.9678978997263907 + >>> cmp.corr('aluminum', 'Catalan') + 0.7853000893691571 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + rad = ( + self._intersection_card() * self._total_complement_card() + ) ** 0.5 + rbc = (self._src_only_card() * self._tar_only_card()) ** 0.5 + + if rbc: + return cos(pi * rbc / (rad + rbc)) + return 1.0 + + def sim(self, src, tar): + """Return the Tetrachoric correlation coefficient of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tetrachoric correlation coefficient + + Examples + -------- + >>> cmp = Tetrachoric() + >>> cmp.sim('cat', 'hat') + 0.994265453051812 + >>> cmp.sim('Niall', 'Neil') + 0.9839489498631954 + >>> cmp.sim('aluminum', 'Catalan') + 0.8926500446845785 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tf_idf.py b/abydos/distance/_tf_idf.py new file mode 100644 index 000000000..5b1941224 --- /dev/null +++ b/abydos/distance/_tf_idf.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tf_idf. + +TF-IDF similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log1p + +from ._token_distance import _TokenDistance +from ..corpus import UnigramCorpus + +__all__ = ['TFIDF'] + + +class TFIDF(_TokenDistance): + r"""TF-IDF similarity. + + For two sets X and Y and a population N, TF-IDF similarity + :cite:`Cohen:2003` is + + .. math:: + + sim_{TF-IDF}(X, Y) = \sum_{w \in X \cap Y} V(w, X) \cdot V(w, Y) + + V(w, S) = \frac{V'(w, S)}{\sqrt{\sum_{w \in S} V'(w, S)^2}} + + V'(w, S) = log(1+TF_{w,S}) \cdot log(1+IDF_w) + + Notes + ----- + One is added to both the TF & IDF values before taking the logarithm to + ensure the logarithms do not fall to 0, which will tend to result in 0.0 + similarities even when there is a degree of matching. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, tokenizer=None, corpus=None, **kwargs): + """Initialize TFIDF instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + corpus : UnigramCorpus + A unigram corpus :py:class:`UnigramCorpus`. 
If None, a corpus will + be created from the two words when a similarity function is called. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(TFIDF, self).__init__(tokenizer=tokenizer, **kwargs) + self._corpus = corpus + + def sim(self, src, tar): + """Return the TF-IDF similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + TF-IDF similarity + + Examples + -------- + >>> cmp = TFIDF() + >>> cmp.sim('cat', 'hat') + 0.30404449697373 + >>> cmp.sim('Niall', 'Neil') + 0.20108911303601 + >>> cmp.sim('aluminum', 'Catalan') + 0.05355175631194 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + src_tok, tar_tok = self._get_tokens() + + if self._corpus is None: + corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer']) + corpus.add_document(src) + corpus.add_document(tar) + else: + corpus = self._corpus + + vws_dict = {} + vwt_dict = {} + for token in src_tok.keys(): + vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token) + for token in tar_tok.keys(): + vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token) + + vws_rss = sum(score ** 2 for score in vws_dict.values()) ** 0.5 + vwt_rss = sum(score ** 2 for score in vwt_dict.values()) ** 0.5 + + return float( + round( + sum( + vws_dict[token] / vws_rss * vwt_dict[token] / vwt_rss + for token in self._intersection().keys() + ), + 14, + ) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tichy.py b/abydos/distance/_tichy.py new file mode 100644 index 000000000..c8ce4ab1c --- /dev/null +++ b/abydos/distance/_tichy.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tichy. + +Tichy edit distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._distance import _Distance + +__all__ = ['Tichy'] + + +class Tichy(_Distance): + """Tichy edit distance. + + Tichy described an algorithm, implemented below, in :cite:`Tichy:1984`. + Following this, :cite:`Cormode:2003` identifies an interpretation of this + algorithm's output as a distance measure, which is largely followed by the + methods below. + + Tichy's algorithm locates substrings of a string S to be copied in order + to create a string T. The only other operation used by his algorithms for + string reconstruction are add operations. 
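# Illustrative sketch (editorial aside, not part of the committed files):
# to make the copy/add decomposition concrete, rebuilding 'hat' from 'cat'
# takes one add (the 'h', which never occurs in the source) plus one block
# move (the substring 'at'), so with the default unit costs the distance is
# 1 + 1 = 2, matching the dist_abs doctest further down. Assuming the class
# is exported from abydos.distance like the other measures:

from abydos.distance import Tichy

assert Tichy().dist_abs('cat', 'hat') == 2             # 1 move + 1 add
assert Tichy(cost=(1, 0)).dist_abs('cat', 'hat') == 1  # count moves only
# --- end of editorial aside ---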
+ + Notes + ----- + While :cite:`Cormode:2003` counts only move operations to calculate + distance, I give the option (enabled by default) of counting add operations + as part of the distance measure. To ignore the cost of add operations, set + the cost value to (1, 0), for example, when initializing the object. + Further, in the case that S and T are identical, a distance of 0 will be + returned, even though this would still be counted as a single move + operation spanning the whole of string S. + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, cost=(1, 1), **kwargs): + """Initialize Tichy instance. + + Parameters + ---------- + cost : tuple + A 2-tuple representing the cost of the two possible edits: + block moves and adds (by default: (1, 1)) + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Tichy, self).__init__(**kwargs) + self._cost = cost + + def dist_abs(self, src, tar): + """Return the Tichy distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int (may return a float if cost has float values) + The Tichy distance between src & tar + + Examples + -------- + >>> cmp = Tichy() + >>> cmp.dist_abs('cat', 'hat') + 2 + >>> cmp.dist_abs('Niall', 'Neil') + 4 + >>> cmp.dist_abs('aluminum', 'Catalan') + 6 + >>> cmp.dist_abs('ATCG', 'TAGC') + 4 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0 + + def _find_max_block(src, tar, q_pos): + length = 0 + p_pos = 0 + p_cur = 0 + + while p_cur + length <= src_len and q_pos + length <= tar_len: + length_cur = 0 + while ( + p_cur + length_cur < src_len + and q_pos + length_cur < tar_len + and src[p_cur + length_cur] == tar[q_pos + length_cur] + ): + length_cur += 1 + if length_cur > length: + length = length_cur + p_pos = p_cur + p_cur += 1 + return p_pos, length + + moves = 0 + adds = 0 + src_len = len(src) + tar_len = len(tar) + q_pos = 0 + + while q_pos < tar_len: + p_pos, length = _find_max_block(src, tar, q_pos) + if length > 0: + moves += 1 + else: + adds += 1 + q_pos += max(1, length) + + return moves * self._cost[0] + adds * self._cost[1] + + def dist(self, src, tar): + """Return the normalized Tichy edit distance between two strings. + + The Tichy distance is normalized by dividing the distance by the length + of the tar string. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The normalized Tichy distance between src & tar + + Examples + -------- + >>> cmp = Tichy() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.666666666667 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 1.0 + >>> cmp.dist('aluminum', 'Catalan') + 0.8571428571428571 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + score = self.dist_abs(src, tar) + if score: + return score / (len(tar) * max(self._cost)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_token_distance.py b/abydos/distance/_token_distance.py index 25b788652..9e5f41b01 100644 --- a/abydos/distance/_token_distance.py +++ b/abydos/distance/_token_distance.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. 
# # Abydos is free software: you can redistribute it and/or modify @@ -18,7 +18,8 @@ """abydos.distance._token_distance. -The distance._TokenDistance module implements abstract class _TokenDistance. +The distance._token_distance._TokenDistance module implements abstract class +_TokenDistance. """ from __future__ import ( @@ -28,16 +29,255 @@ unicode_literals, ) -from collections import Counter +from collections import Counter, OrderedDict +from itertools import product +from math import exp, log1p +from numpy import copy as np_copy +from numpy import zeros as np_zeros + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: # pragma: no cover + # If the system lacks the scipy library, we'll fall back to our + # Python+Numpy implementation of the Hungarian algorithm + linear_sum_assignment = None + +from ._damerau_levenshtein import DamerauLevenshtein from ._distance import _Distance -from ..tokenizer import QGrams +from ._lcprefix import LCPrefix +from ._levenshtein import Levenshtein +from ..stats import ConfusionTable +from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer + +__all__ = ['_TokenDistance'] class _TokenDistance(_Distance): - """Abstract Token Distance class.""" + r"""Abstract Token Distance class. + + .. _confusion_table: + + +----------------+--------------+-----------------+-------+ + | | |in| ``tar`` | |notin| ``tar`` | | + +----------------+--------------+-----------------+-------+ + | |in| ``src`` | |a| | |b| | |a+b| | + +----------------+--------------+-----------------+-------+ + | |notin| ``src``| |c| | |d| | |c+d| | + +----------------+--------------+-----------------+-------+ + | | |a+c| | |b+d| | |n| | + +----------------+--------------+-----------------+-------+ + + .. |in| replace:: :math:`x \in` + .. |notin| replace:: :math:`x \notin` + + .. |a| replace:: :math:`a = |X \cap Y|` + .. |b| replace:: :math:`b = |X\setminus Y|` + .. |c| replace:: :math:`c = |Y \setminus X|` + .. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|` + .. |n| replace:: :math:`n = |N|` + .. |a+b| replace:: :math:`p_1 = a+b = |X|` + .. |a+c| replace:: :math:`p_2 = a+c = |Y|` + .. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|` + .. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|` + + .. versionadded:: 0.3.6 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + r"""Initialize _TokenDistance instance. + + .. _intersection_type: + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + + - 'crisp': Ordinary intersection, wherein items are entirely + members or non-members of the intersection. (Default) + - ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`, + wherein items can be partially members of the intersection + if their similarity meets or exceeds a threshold value. This + also takes `metric` (by default :class:`Levenshtein()`) and + `threshold` (by default 0.8) parameters. + - ``soft``: Soft intersection, defined by :cite:`Russ:2014`, + wherein items can be partially members of the intersection + depending on their similarity. This also takes a `metric` + (by default :class:`DamerauLevenshtein()`) parameter. + - ``linkage``: Group linkage, defined by :cite:`On:2007`. Like + the soft intersection, items can be partially members of the + intersection, but the method of pairing similar members is + somewhat more complex. See the cited paper for details. 
This + also takes `metric` + (by default :class:`DamerauLevenshtein()`) and `threshold` + (by default 0.1) parameters. + **kwargs + Arbitrary keyword arguments + + + .. _alphabet: + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + + - If a Counter is supplied, it is used directly in computing + the complement of the tokens in both sets. + - If a collection is supplied, it is converted to a Counter + and used directly. In the case of a single string being + supplied and the QGram tokenizer being used, the full + alphabet is inferred (i.e. + :math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}` + is used as the cardinality of the full alphabet. + - If an int is supplied, it is used as the cardinality of the + full alphabet. + - If None is supplied, the cardinality of the full alphabet + is inferred if QGram of QSkipgrams tokenization is used (i.e. + :math:`28^{QGrams.qval}` is used as the cardinality of the + full alphabet or :math:`26` if QGrams.qval is 1, which + assumes the strings are English language strings and only + contain letters of a single case). Otherwise, the cardinality + of the complement of the total will be 0. + normalizer : str + This represents the normalization applied to the values in the + 2x2 contingency table prior to any of the cardinality (\*_card) + methods returning a value. By default, no normalization is applied, + but the following values are supported: + + - ``proportional`` : :math:`\frac{x}{n}`, where n is the total + population + - ``log`` : :math:`log(1+x)` + - ``exp`` : :math:`e^x` + - ``laplace`` : :math:`x+1` + - ``inverse`` : :math:`\frac{1}{x}` + - ``complement`` : :math:`n-x`, where n is the total population + internal_assignment_problem : bool + When using ``linkage`` as the intersection type (i.e. group + linkage), this forces use of the internal implementation to solve + the assignment problem, rather than scipy's linear_sum_assignment. + + .. 
versionadded:: 0.4.0 + + """ + super(_TokenDistance, self).__init__(**kwargs) + + qval = 2 if 'qval' not in self.params else self.params['qval'] + self.params['tokenizer'] = ( + tokenizer + if tokenizer is not None + else WhitespaceTokenizer() + if qval == 0 + else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) + ) + + if hasattr(self.params['tokenizer'], 'qval'): + if isinstance(self.params['tokenizer'].qval, int): + qvals = [self.params['tokenizer'].qval] + else: + qvals = list(self.params['tokenizer'].qval) + else: + qvals = [] + + if 'alphabet' in self.params: + if isinstance(self.params['alphabet'], str): + self.params['alphabet'] = set(self.params['alphabet']) + if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): + self.params['alphabet'] |= set( + self.params['tokenizer'].start_stop + ) + self.params['alphabet'] = sum( + len(self.params['alphabet']) ** qval for qval in qvals + ) + if hasattr(self.params['alphabet'], '__len__') and not isinstance( + self.params['alphabet'], Counter + ): + self.params['alphabet'] = len(self.params['alphabet']) + elif self.params['alphabet'] is None and isinstance( + self.params['tokenizer'], (QGrams, QSkipgrams) + ): + self.params['alphabet'] = sum( + 28 ** qval if qval > 1 else 26 for qval in qvals + ) + else: + if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): + self.params['alphabet'] = sum( + 28 ** qval if qval > 1 else 26 for qval in qvals + ) + else: + self.params['alphabet'] = None + + if intersection_type == 'soft': + if 'metric' not in self.params or self.params['metric'] is None: + self.params['metric'] = DamerauLevenshtein() + self._lcprefix = LCPrefix() + self._intersection = self._soft_intersection + elif intersection_type == 'fuzzy': + if 'metric' not in self.params or self.params['metric'] is None: + self.params['metric'] = Levenshtein() + if 'threshold' not in self.params: + self.params['threshold'] = 0.8 + self._intersection = self._fuzzy_intersection + elif intersection_type == 'linkage': + if 'metric' not in self.params or self.params['metric'] is None: + self.params['metric'] = DamerauLevenshtein() + if 'threshold' not in self.params: + self.params['threshold'] = 0.1 + self._intersection = self._group_linkage_intersection + else: + self._intersection = self._crisp_intersection + + self._src_tokens = Counter() + self._tar_tokens = Counter() + self._population_card_value = 0 + + # initialize normalizer + self.normalizer = self._norm_none + + self._norm_dict = { + 'proportional': self._norm_proportional, + 'log': self._norm_log, + 'exp': self._norm_exp, + 'laplace': self._norm_laplace, + 'inverse': self._norm_inverse, + 'complement': self._norm_complement, + } + + def _norm_none(self, x, _squares, _pop): + return x - def _get_qgrams(self, src, tar, qval=0, skip=0): + def _norm_proportional(self, x, _squares, pop): + return x / max(1, pop) + + def _norm_log(self, x, _squares, _pop): + return log1p(x) + + def _norm_exp(self, x, _squares, _pop): + return exp(x) + + def _norm_laplace(self, x, squares, _pop): + return x + squares + + def _norm_inverse(self, x, _squares, pop): + return 1 / x if x else pop + + def _norm_complement(self, x, _squares, pop): + return pop - x + + def _tokenize(self, src, tar): """Return the Q-Grams in src & tar. 
Parameters @@ -46,11 +286,6 @@ def _get_qgrams(self, src, tar, qval=0, skip=0): Source string (or QGrams/Counter objects) for comparison tar : str Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version - skip : int - The number of characters to skip (only works when src and tar are - strings) Returns ------- @@ -60,16 +295,433 @@ def _get_qgrams(self, src, tar, qval=0, skip=0): Examples -------- >>> pe = _TokenDistance() - >>> pe._get_qgrams('AT', 'TT', qval=2) - (QGrams({'$A': 1, 'AT': 1, 'T#': 1}), - QGrams({'$T': 1, 'TT': 1, 'T#': 1})) + >>> pe._tokenize('AT', 'TT')._get_tokens() + (Counter({'$A': 1, 'AT': 1, 'T#': 1}), + Counter({'$T': 1, 'TT': 1, 'T#': 1})) + + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + + """ + self._src_orig = src + self._tar_orig = tar + + if isinstance(src, Counter): + self._src_tokens = src + else: + self._src_tokens = ( + self.params['tokenizer'].tokenize(src).get_counter() + ) + if isinstance(src, Counter): + self._tar_tokens = tar + else: + self._tar_tokens = ( + self.params['tokenizer'].tokenize(tar).get_counter() + ) + + self._population_card_value = self._calc_population_card() + + # Set up the normalizer, a function of two variables: + # x is the value in the contingency table square(s) + # n is the number of squares that x represents + if ( + 'normalizer' in self.params + and self.params['normalizer'] in self._norm_dict + ): + self.normalizer = self._norm_dict[self.params['normalizer']] + + return self + def _get_tokens(self): + """Return the src and tar tokens as a tuple.""" + return self._src_tokens, self._tar_tokens + + def _src_card(self): + r"""Return the cardinality of the tokens in the source set.""" + return self.normalizer( + sum(abs(val) for val in self._src_tokens.values()), + 2, + self._population_card_value, + ) + + def _src_only(self): + r"""Return the src tokens minus the tar tokens. + + For (multi-)sets S and T, this is :math:`S \setminus T`. + """ + return self._src_tokens - self._intersection() + + def _src_only_card(self): + """Return the cardinality of the tokens only in the source set.""" + return self.normalizer( + sum(abs(val) for val in self._src_only().values()), + 1, + self._population_card_value, + ) + + def _tar_card(self): + r"""Return the cardinality of the tokens in the target set.""" + return self.normalizer( + sum(abs(val) for val in self._tar_tokens.values()), + 2, + self._population_card_value, + ) + + def _tar_only(self): + r"""Return the tar tokens minus the src tokens. + + For (multi-)sets S and T, this is :math:`T \setminus S`. """ - if isinstance(src, Counter) and isinstance(tar, Counter): - return src, tar - if qval > 0: - return QGrams(src, qval, '$#', skip), QGrams(tar, qval, '$#', skip) - return Counter(src.strip().split()), Counter(tar.strip().split()) + return self._tar_tokens - self._intersection() + + def _tar_only_card(self): + """Return the cardinality of the tokens only in the target set.""" + return self.normalizer( + sum(abs(val) for val in self._tar_only().values()), + 1, + self._population_card_value, + ) + + def _symmetric_difference(self): + r"""Return the symmetric difference of tokens from src and tar. + + For (multi-)sets S and T, this is :math:`S \triangle T`. 
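To make the 2x2 quantities from the class docstring concrete, here is a hypothetical worked example (not part of the diff; figures are my own reckoning from the methods above under the default QGrams settings): tokenizing 'cat' and 'hat' into bigrams with '$'/'#' start and stop symbols gives src tokens {'$c', 'ca', 'at', 't#'} and tar tokens {'$h', 'ha', 'at', 't#'}, so a = 2, b = 2, c = 2 and, with the inferred :math:`28^2 = 784` bigram alphabet, d = 784 - 6 = 778.

>>> td = _TokenDistance()
>>> _ = td._tokenize('cat', 'hat')
>>> td._intersection_card(), td._src_only_card(), td._tar_only_card()
(2, 2, 2)
>>> td._total_complement_card()
778

These are the same a, b, c, and d that the concrete measures later in this changeset (the Tulloss measures, unigram subtuple, and Unknown A/B) plug into their formulas.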
+ """ + return self._src_only() + self._tar_only() + + def _symmetric_difference_card(self): + """Return the cardinality of the symmetric difference.""" + return self.normalizer( + sum(abs(val) for val in self._symmetric_difference().values()), + 2, + self._population_card_value, + ) + + def _total(self): + """Return the sum of the sets. + + For (multi-)sets S and T, this is :math:`S + T`. + + In the case of multisets, this counts values in the interesection + twice. In the case of sets, this is identical to the union. + """ + return self._src_tokens + self._tar_tokens + + def _total_card(self): + """Return the cardinality of the complement of the total.""" + return self.normalizer( + sum(abs(val) for val in self._total().values()), + 3, + self._population_card_value, + ) + + def _total_complement_card(self): + """Return the cardinality of the complement of the total.""" + if self.params['alphabet'] is None: + return self.normalizer(0, 1, self._population_card_value) + elif isinstance(self.params['alphabet'], Counter): + return self.normalizer( + max( + 0, + sum( + abs(val) + for val in ( + self.params['alphabet'] - self._total() + ).values() + ), + ), + 1, + self._population_card_value, + ) + return self.normalizer( + max(0, self.params['alphabet'] - len(self._total().values())), + 1, + self._population_card_value, + ) + + def _calc_population_card(self): + """Return the cardinality of the population.""" + save_normalizer = self.normalizer + self.normalizer = self._norm_none + pop = self._total_card() + self._total_complement_card() + self.normalizer = save_normalizer + return pop + + def _population_card(self): + """Return the cardinality of the population.""" + return self.normalizer( + self._population_card_value, 4, self._population_card_value + ) + + def _population_unique_card(self): + """Return the cardinality of the population minus the intersection.""" + return self.normalizer( + self._population_card_value - self._intersection_card(), + 4, + self._population_card_value, + ) + + def _union(self): + r"""Return the union of tokens from src and tar. + + For (multi-)sets S and T, this is :math:`S \cup T`. + """ + return self._total() - self._intersection() + + def _union_card(self): + """Return the cardinality of the union.""" + return self.normalizer( + sum(abs(val) for val in self._union().values()), + 3, + self._population_card_value, + ) + + def _difference(self): + """Return the difference of the tokens, supporting negative values.""" + _src_copy = Counter(self._src_tokens) + _src_copy.subtract(self._tar_tokens) + return _src_copy + + def _crisp_intersection(self): + r"""Return the intersection of tokens from src and tar. + + For (multi-)sets S and T, this is :math:`S \cap T`. + """ + return self._src_tokens & self._tar_tokens + + def _soft_intersection(self): + """Return the soft intersection of the tokens in src and tar. + + This implements the soft intersection defined by :cite:`Russ:2014`. + """ + intersection = self._crisp_intersection() + src_only = self._src_tokens - self._tar_tokens + tar_only = self._tar_tokens - self._src_tokens + + def _membership(src, tar): + greater_length = max(len(src), len(tar)) + return ( + max( + greater_length - self.params['metric'].dist_abs(src, tar), + self._lcprefix.dist_abs(src, tar), + ) + / greater_length + ) + + # Dictionary ordering is important for reproducibility, so insertion + # order needs to be controlled and retained. 
+ memberships = OrderedDict( + ((src, tar), _membership(src, tar)) + for src, tar in sorted(product(src_only, tar_only)) + ) + + while memberships: + src_tok, tar_tok = max(memberships, key=memberships.get) + if memberships[src_tok, tar_tok] > 0.0: + pairings = min(src_only[src_tok], tar_only[tar_tok]) + if pairings: + intersection[src_tok] += ( + memberships[src_tok, tar_tok] * pairings / 2 + ) + intersection[tar_tok] += ( + memberships[src_tok, tar_tok] * pairings / 2 + ) + src_only[src_tok] -= pairings + tar_only[tar_tok] -= pairings + del memberships[src_tok, tar_tok] + + return intersection + + def _fuzzy_intersection(self): + r"""Return the fuzzy intersection of the tokens in src and tar. + + This implements the fuzzy intersection defined by :cite:`Wang:2014`. + + For two sets X and Y, the intersection :cite:`Wang:2014` is the sum of + similarities of all tokens in the two sets that are greater than or + equal to some threshold value (:math:`\delta`). + + The lower bound of on this intersection and the value when + :math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than + :math:`\frac{\delta}{1-\delta}`, 4 in the case of the default threshold + :math:`\delta = 0.8`, must match exactly to be included in the + intersection. + + + .. versionadded:: 0.4.0 + + """ + intersection = self._crisp_intersection() + src_only = self._src_tokens - self._tar_tokens + tar_only = self._tar_tokens - self._src_tokens + + for src_tok in src_only: + for tar_tok in tar_only: + sim = self.params['metric'].sim(src_tok, tar_tok) + if sim >= self.params['threshold']: + intersection[src_tok] += (sim / 2) * src_only[src_tok] + intersection[tar_tok] += (sim / 2) * tar_only[tar_tok] + + return intersection + + def _group_linkage_intersection(self): + r"""Return the group linkage intersection of the tokens in src and tar. + + This is based on group linkage, as defined by :cite:`On:2007`. + + Most of this method is concerned with solving the assignment problem, + in order to find the weight of the maximum weight bipartite matching. + If the system has SciPy installed, we use it's linear_sum_assignment + function to get the assignments. Otherwise, we use the Hungarian + algorithm of Munkres :cite:`Munkres:1957`, implemented in Python & + Numpy. + + .. 
versionadded:: 0.4.0 + + """ + intersection = self._crisp_intersection() + src_only = sorted(self._src_tokens - self._tar_tokens) + tar_only = sorted(self._tar_tokens - self._src_tokens) + + if linear_sum_assignment and not ( + 'internal_assignment_problem' in self.params + and self.params['internal_assignment_problem'] + ): + arr = np_zeros((len(tar_only), len(src_only))) + + for col in range(len(src_only)): + for row in range(len(tar_only)): + arr[row, col] = self.params['metric'].dist( + src_only[col], tar_only[row] + ) + + for row, col in zip(*linear_sum_assignment(arr)): + sim = 1.0 - arr[row, col] + if sim >= self.params['threshold']: + intersection[src_only[col]] += (sim / 2) * ( + self._src_tokens - self._tar_tokens + )[src_only[col]] + intersection[tar_only[row]] += (sim / 2) * ( + self._tar_tokens - self._src_tokens + )[tar_only[row]] + else: + n = max(len(tar_only), len(src_only)) + arr = np_zeros((n, n), dtype=float) + + for col in range(len(src_only)): + for row in range(len(tar_only)): + arr[row, col] = self.params['metric'].dist( + src_only[col], tar_only[row] + ) + + src_only += [''] * (n - len(src_only)) + tar_only += [''] * (n - len(tar_only)) + + orig_sim = 1 - np_copy(arr) + + # Step 1 + for row in range(n): + arr[row, :] -= arr[row, :].min() + # Step 2 + for col in range(n): + arr[:, col] -= arr[:, col].min() + + while True: + # Step 3 + assignments = {} + + allocated_cols = set() + allocated_rows = set() + assigned_rows = set() + assigned_cols = set() + + for row in range(n): + if (arr[row, :] == 0.0).sum() == 1: + col = arr[row, :].argmin() + if col not in allocated_cols: + assignments[row, col] = orig_sim[row, col] + allocated_cols.add(col) + assigned_rows.add(row) + assigned_cols.add(col) + + for col in range(n): + if (arr[:, col] == 0.0).sum() == 1: + row = arr[:, col].argmin() + if row not in allocated_rows: + assignments[row, col] = orig_sim[row, col] + allocated_rows.add(row) + assigned_rows.add(row) + assigned_cols.add(col) + + if len(assignments) == n: + break + + marked_rows = {_ for _ in range(n) if _ not in assigned_rows} + marked_cols = set() + for row in sorted(set(marked_rows)): + for col, mark in enumerate(arr[row, :] == 0.0): + if mark: + marked_cols.add(col) + for row2 in range(n): + if (row2, col) in assignments: + marked_rows.add(row2) + + if n - len(marked_rows) + len(marked_cols) == n: + # We have sufficient lines + for col in range(n): + row = arr[:, col].argmin() + assignments[row, col] = orig_sim[row, col] + break + + # Step 4 + min_val = arr[tuple(marked_rows), :][ + :, sorted(set(range(n)) - marked_cols) + ].min() + for row in range(n): + for col in range(n): + if row in marked_rows and col not in marked_cols: + arr[row, col] -= min_val + elif row not in marked_rows and col in marked_cols: + arr[row, col] += min_val + + for row, col in assignments.keys(): + sim = orig_sim[row, col] + if sim >= self.params['threshold']: + intersection[src_only[col]] += (sim / 2) * ( + self._src_tokens - self._tar_tokens + )[src_only[col]] + intersection[tar_only[row]] += (sim / 2) * ( + self._tar_tokens - self._src_tokens + )[tar_only[row]] + + return intersection + + def _intersection_card(self): + """Return the cardinality of the intersection.""" + return self.normalizer( + sum(abs(val) for val in self._intersection().values()), + 1, + self._population_card_value, + ) + + def _intersection(self): + """Return the intersection. + + This function may be overridden by setting the intersection_type during + initialization. 
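In practice this means any token-based measure built on this class can swap intersection behaviour at construction time. A minimal, hypothetical usage sketch (not part of the diff; the threshold value is illustrative only):

>>> from abydos.distance import Levenshtein, Tversky
>>> crisp = Tversky()
>>> fuzzy = Tversky(intersection_type='fuzzy', metric=Levenshtein(), threshold=0.9)
>>> linkage = Tversky(intersection_type='linkage')

Since the fuzzy, soft, and linkage intersections only ever add partial memberships on top of the crisp intersection, their cardinalities are never smaller than the crisp ones.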
+ """ + return self._crisp_intersection() # pragma: no cover + + def _get_confusion_table(self): + """Return the token counts as a ConfusionTable object.""" + return ConfusionTable( + self._intersection_card(), + self._total_complement_card(), + self._src_only_card(), + self._tar_only_card(), + ) if __name__ == '__main__': diff --git a/abydos/distance/_tulloss_r.py b/abydos/distance/_tulloss_r.py new file mode 100644 index 000000000..0011d8b30 --- /dev/null +++ b/abydos/distance/_tulloss_r.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tulloss_r. + +Tulloss' R similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['TullossR'] + + +class TullossR(_TokenDistance): + r"""Tulloss' R similarity. + + For two sets X and Y and a population N, Tulloss' R similarity + :cite:`Tulloss:1997` is + + .. math:: + + sim_{Tulloss_R}(X, Y) = + \frac{log(1+\frac{|X \cap Y|}{|X|}) \cdot log(1+\frac{|X \cap Y|} + {|Y|})}{log^2(2)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Tulloss_R} = + \frac{log(1+\frac{a}{a+b}) \cdot log(1+\frac{a}{a+c})}{log^2(2)} + + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize TullossR instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(TullossR, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return Tulloss' R similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tulloss' R similarity + + Examples + -------- + >>> cmp = TullossR() + >>> cmp.sim('cat', 'hat') + 0.34218112724994865 + >>> cmp.sim('Niall', 'Neil') + 0.2014703364316006 + >>> cmp.sim('aluminum', 'Catalan') + 0.025829125872886074 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 0.0 + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return log(1 + a / (a + b)) * log(1 + a / (a + c)) / log(2) ** 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tulloss_s.py b/abydos/distance/_tulloss_s.py new file mode 100644 index 000000000..69b60cacc --- /dev/null +++ b/abydos/distance/_tulloss_s.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tulloss_s. + +Tulloss' S similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['TullossS'] + + +class TullossS(_TokenDistance): + r"""Tulloss' S similarity. + + For two sets X and Y and a population N, Tulloss' S similarity + :cite:`Tulloss:1997` is + + .. math:: + + sim_{Tulloss_S}(X, Y) = + \frac{1}{\sqrt{log_2(2+\frac{min(|X \setminus Y|, |Y \setminus X|)} + {|X \cap Y|+1})}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Tulloss_S} = + \frac{1}{\sqrt{log_2(2+\frac{min(b,c)}{a+1})}} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize TullossS instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(TullossS, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return Tulloss' S similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tulloss' S similarity + + Examples + -------- + >>> cmp = TullossS() + >>> cmp.sim('cat', 'hat') + 0.8406515643305636 + >>> cmp.sim('Niall', 'Neil') + 0.7943108670863426 + >>> cmp.sim('aluminum', 'Catalan') + 0.6376503816669968 + >>> cmp.sim('ATCG', 'TAGC') + 0.5968309535438173 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return 1 / (log(2 + min(b, c) / (a + 1), 2)) ** 0.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tulloss_t.py b/abydos/distance/_tulloss_t.py new file mode 100644 index 000000000..44e8f8f29 --- /dev/null +++ b/abydos/distance/_tulloss_t.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tulloss_t. + +Tulloss' T similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance +from ._tulloss_r import TullossR +from ._tulloss_s import TullossS +from ._tulloss_u import TullossU + +__all__ = ['TullossT'] + + +class TullossT(_TokenDistance): + r"""Tulloss' T similarity. + + For two sets X and Y and a population N, Tulloss' T similarity + :cite:`Tulloss:1997` is + + .. math:: + + \begin{array}{l} + sim_{Tulloss_T}(X, Y) = \sqrt{sim_{Tulloss_U}(X, Y) \cdot + sim_{Tulloss_S}(X, Y) \cdot sim_{Tulloss_R}(X, Y)} + + = \sqrt{ + log_2(1+\frac{min(|X \setminus Y|, |Y \setminus X|)+|X \cap Y|} + {max(|X \setminus Y|, |Y \setminus X|)+|X \cap Y|}) \cdot + \frac{1}{\sqrt{log_2(2+\frac{min(|X \setminus Y|, |Y \setminus X|)} + {|X \cap Y|+1})}} \cdot + \frac{log(1+\frac{|X \cap Y|}{|X|}) \cdot log(1+\frac{|X \cap Y|} + {|Y|})}{log^2(2)}} + \end{array} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Tulloss_T} = \sqrt{ + log_2\Big(1+\frac{min(b, c)+a}{max(b, c)+a}\Big) \cdot + \frac{1}{\sqrt{log_2(2+\frac{min(b,c)}{a+1})}} \cdot + \frac{log(1+\frac{a}{a+b}) \cdot log(1+\frac{a}{a+c})}{log^2(2)}} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize TullossT instance. 
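As a cross-check of the geometric-mean formula above (my own arithmetic, using the R, S, and U doctest values for 'cat' vs 'hat'): :math:`\sqrt{0.34218 \cdot 0.84065 \cdot 1.0} \approx 0.5363`, which agrees with the sim doctest below.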
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(TullossT, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + self._r = TullossR() + self._s = TullossS() + self._u = TullossU() + + def sim(self, src, tar): + """Return Tulloss' T similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tulloss' T similarity + + Examples + -------- + >>> cmp = TullossT() + >>> cmp.sim('cat', 'hat') + 0.5363348766461724 + >>> cmp.sim('Niall', 'Neil') + 0.37408737056893265 + >>> cmp.sim('aluminum', 'Catalan') + 0.12293007830952692 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + r = self._r.sim(src, tar) + s = self._s.sim(src, tar) + u = self._u.sim(src, tar) + + return (r * s * u) ** 0.5 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tulloss_u.py b/abydos/distance/_tulloss_u.py new file mode 100644 index 000000000..f5b970aa3 --- /dev/null +++ b/abydos/distance/_tulloss_u.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._tulloss_u. + +Tulloss' U similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['TullossU'] + + +class TullossU(_TokenDistance): + r"""Tulloss' U similarity. + + For two sets X and Y, Tulloss' U similarity + :cite:`Tulloss:1997` is + + .. math:: + + sim_{Tulloss_U}(X, Y) = + log_2\Big(1+\frac{min(|X \setminus Y|, |Y \setminus X|)+|X \cap Y|} + {max(|X \setminus Y|, |Y \setminus X|)+|X \cap Y|}\Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Tulloss_U} = + log_2\Big(1+\frac{min(b, c)+a}{max(b, c)+a}\Big) + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize TullossU instance. 
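Worked through by hand (my own arithmetic): for 'cat' vs 'hat' under the default bigram tokenization, :math:`a = b = c = 2`, so :math:`sim_{Tulloss_U} = log_2(1 + \frac{2+2}{2+2}) = log_2(2) = 1.0`, which is the first doctest value below.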
+ + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(TullossU, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return Tulloss' U similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Tulloss' U similarity + + Examples + -------- + >>> cmp = TullossU() + >>> cmp.sim('cat', 'hat') + 1.0 + >>> cmp.sim('Niall', 'Neil') + 0.8744691179161412 + >>> cmp.sim('aluminum', 'Catalan') + 0.9175378398080271 + >>> cmp.sim('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return log(1 + (min(b, c) + a) / (max(b, c) + a), 2) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_tversky.py b/abydos/distance/_tversky.py index 0477f292b..49ccd7837 100644 --- a/abydos/distance/_tversky.py +++ b/abydos/distance/_tversky.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._token_distance import _TokenDistance +from .. import __version__ __all__ = ['Tversky', 'dist_tversky', 'sim_tversky'] @@ -38,8 +41,11 @@ class Tversky(_TokenDistance): The Tversky index :cite:`Tversky:1977` is defined as: For two sets X and Y: - :math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|} - {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`. + + .. math:: + + sim_{Tversky}(X, Y) = \frac{|X \cap Y|} + {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} :math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto similarity coefficients. @@ -56,32 +62,77 @@ class Tversky(_TokenDistance): Parameter values' relation to 1 emphasizes different types of contributions: - - :math:`\alpha and \beta > 1` emphsize unique contributions over the - intersection - - :math:`\alpha and \beta < 1` emphsize the intersection over unique - contributions + - :math:`\alpha` and :math:`\beta > 1` emphsize unique contributions + over the intersection + - :math:`\alpha` and :math:`\beta < 1` emphsize the intersection over + unique contributions The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated by specifying a bias parameter. + + + .. versionadded:: 0.3.6 """ - def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None): - """Return the Tversky index of two strings. 
+ def __init__( + self, + alpha=1.0, + beta=1.0, + bias=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Tversky instance. Parameters ---------- - src : str - Source string (or QGrams/Counter objects) for comparison - tar : str - Target string (or QGrams/Counter objects) for comparison - qval : int - The length of each q-gram; 0 for non-q-gram version alpha : float Tversky index parameter as described above beta : float Tversky index parameter as described above bias : float The symmetric Tversky index bias parameter + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Tversky, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + self.set_params(alpha=alpha, beta=beta, bias=bias) + + def sim(self, src, tar): + """Return the Tversky index of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison Returns ------- @@ -106,8 +157,13 @@ def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None): >>> cmp.sim('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - if alpha < 0 or beta < 0: + if self.params['alpha'] < 0 or self.params['beta'] < 0: raise ValueError( 'Unsupported weight assignment; alpha and beta ' + 'must be greater than or equal to 0.' 
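A brief, hypothetical illustration of the parameters (not part of the diff; figures are my own reckoning from the formula above and the default bigram tokenization of 'cat' and 'hat', where a = b = c = 2): :math:`\alpha = \beta = 1` reproduces Jaccard, while :math:`\alpha = \beta = 0.5` reproduces Dice.

>>> Tversky().sim('cat', 'hat')
0.3333333333333333
>>> Tversky(alpha=0.5, beta=0.5).sim('cat', 'hat')
0.5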
@@ -118,32 +174,41 @@ def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None): elif not src or not tar: return 0.0 - q_src, q_tar = self._get_qgrams(src, tar, qval) - q_src_mag = sum(q_src.values()) - q_tar_mag = sum(q_tar.values()) - q_intersection_mag = sum((q_src & q_tar).values()) + self._tokenize(src, tar) - if not q_src or not q_tar: + q_src_mag = self._src_only_card() + q_tar_mag = self._tar_only_card() + q_intersection_mag = self._intersection_card() + + if not self._src_tokens or not self._tar_tokens: return 0.0 - if bias is None: + if self.params['bias'] is None: return q_intersection_mag / ( q_intersection_mag - + alpha * (q_src_mag - q_intersection_mag) - + beta * (q_tar_mag - q_intersection_mag) + + self.params['alpha'] * q_src_mag + + self.params['beta'] * q_tar_mag ) - a_val = min( - q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag - ) - b_val = max( - q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag + a_val, b_val = sorted((q_src_mag, q_tar_mag)) + c_val = q_intersection_mag + self.params['bias'] + return c_val / ( + self.params['beta'] + * ( + self.params['alpha'] * a_val + + (1 - self.params['alpha']) * b_val + ) + + c_val ) - c_val = q_intersection_mag + bias - return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val) -def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Tversky.sim method instead.', +) +def sim_tversky(src, tar, qval=2, alpha=1.0, beta=1.0, bias=None): """Return the Tversky index of two strings. This is a wrapper for :py:meth:`Tversky.sim`. @@ -155,7 +220,7 @@ def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alpha : float Tversky index parameter as described above beta : float @@ -179,11 +244,20 @@ def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): >>> sim_tversky('ATCG', 'TAGC') 0.0 + + .. versionadded:: 0.1.0 + """ - return Tversky().sim(src, tar, qval, alpha, beta, bias) + return Tversky(alpha=alpha, beta=beta, bias=bias, qval=qval).sim(src, tar) -def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Tversky.dist method instead.', +) +def dist_tversky(src, tar, qval=2, alpha=1.0, beta=1.0, bias=None): """Return the Tversky distance between two strings. This is a wrapper for :py:meth:`Tversky.dist`. @@ -195,7 +269,7 @@ def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): tar : str Target string (or QGrams/Counter objects) for comparison qval : int - The length of each q-gram; 0 for non-q-gram version + The length of each q-gram alpha : float Tversky index parameter as described above beta : float @@ -219,8 +293,11 @@ def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): >>> dist_tversky('ATCG', 'TAGC') 1.0 + + .. 
versionadded:: 0.1.0 + """ - return Tversky().dist(src, tar, qval, alpha, beta, bias) + return Tversky(alpha=alpha, beta=beta, bias=bias, qval=qval).dist(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_typo.py b/abydos/distance/_typo.py index 9a1d36077..15e86097d 100644 --- a/abydos/distance/_typo.py +++ b/abydos/distance/_typo.py @@ -28,14 +28,19 @@ unicode_literals, ) +from itertools import chain from math import log +from deprecation import deprecated + from numpy import float32 as np_float32 from numpy import zeros as np_zeros from six.moves import range from ._distance import _Distance +from .. import __version__ + __all__ = ['Typo', 'dist_typo', 'sim_typo', 'typo'] @@ -46,6 +51,8 @@ class Typo(_Distance): This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of this was copied from that module. Compared to the original, this supports different metrics for substitution. + + .. versionadded:: 0.3.6 """ # fmt: off @@ -53,60 +60,64 @@ class Typo(_Distance): (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='), ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']', '\\'), - ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''), - ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')), + ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', "'"), + ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'), + ('', '', '', ' ')), (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'), ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'), ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'), - ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?')) + ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'), + ('', '', '', ' ')) ), 'Dvorak': ( (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'), - ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=', + ('', "'", ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=', '\\'), ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'), - ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')), + ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z'), + ('', '', '', ' ')), (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'), ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'), ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'), - ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z')) + ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'), + ('', '', '', ' ')) ), 'AZERTY': ( - (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='), + (('²', '&', 'é', '"', "'", '(', '-', 'è', '_', 'ç', 'à', ')', '='), ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'), ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'), - ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')), + ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!'), + ('', '', '', ' ')), (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'), ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'), ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'), - ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§')) + ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'), + ('', '', '', ' ')) ), 'QWERTZ': ( (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''), ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+', '\\'), ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'), - ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', 
'-')), + ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'), + ('', '', '', ' ')), (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''), ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''), - ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''), - ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_')) + ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'"), + ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'), + ('', '', '', ' ')) )} # fmt: on - def dist_abs( + def __init__( self, - src, - tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY', + failsafe=False, + **kwargs ): - """Return the typo distance between two strings. + """Initialize Typo instance. Parameters ---------- - src : str - Source string for comparison - tar : str - Target string for comparison metric : str Supported values include: ``euclidean``, ``manhattan``, ``log-euclidean``, and ``log-manhattan`` @@ -118,7 +129,35 @@ def dist_abs( a log metric is used. layout : str Name of the keyboard layout to use (Currently supported: - ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``) + ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``, ``auto``). If + ``auto`` is selected, the class will attempt to determine an + appropriate keyboard based on the supplied words. + failsafe : bool + If True, substitution of an unknown character (one not present on + the selected keyboard) will incur a cost equal to an insertion plus + a deletion. + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(Typo, self).__init__(**kwargs) + self._metric = metric + self._cost = cost + self._layout = layout + self._failsafe = failsafe + + def dist_abs(self, src, tar): + """Return the typo distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison Returns ------- @@ -142,26 +181,33 @@ def dist_abs( >>> cmp.dist_abs('ATCG', 'TAGC') 2.5 - >>> cmp.dist_abs('cat', 'hat', metric='manhattan') + >>> cmp = Typo(metric='manhattan') + >>> cmp.dist_abs('cat', 'hat') 2.0 - >>> cmp.dist_abs('Niall', 'Neil', metric='manhattan') + >>> cmp.dist_abs('Niall', 'Neil') 3.0 - >>> cmp.dist_abs('Colin', 'Cuilen', metric='manhattan') + >>> cmp.dist_abs('Colin', 'Cuilen') 3.5 - >>> cmp.dist_abs('ATCG', 'TAGC', metric='manhattan') + >>> cmp.dist_abs('ATCG', 'TAGC') 2.5 - >>> cmp.dist_abs('cat', 'hat', metric='log-manhattan') + >>> cmp = Typo(metric='log-manhattan') + >>> cmp.dist_abs('cat', 'hat') 0.804719 - >>> cmp.dist_abs('Niall', 'Neil', metric='log-manhattan') + >>> cmp.dist_abs('Niall', 'Neil') 2.2424533 - >>> cmp.dist_abs('Colin', 'Cuilen', metric='log-manhattan') + >>> cmp.dist_abs('Colin', 'Cuilen') 2.2424533 - >>> cmp.dist_abs('ATCG', 'TAGC', metric='log-manhattan') + >>> cmp.dist_abs('ATCG', 'TAGC') 2.3465736 + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ - ins_cost, del_cost, sub_cost, shift_cost = cost + ins_cost, del_cost, sub_cost, shift_cost = self._cost if src == tar: return 0.0 @@ -170,9 +216,22 @@ def dist_abs( if not tar: return len(src) * del_cost - keyboard = self._keyboard[layout] + if self._layout == 'auto': + for kb in ['QWERTY', 'QWERTZ', 'AZERTY']: + keys = set(chain(*chain(*self._keyboard[kb]))) + letters = set(src) | set(tar) + if not (letters - keys): + keyboard = self._keyboard[kb] + break + else: + # Fallback to QWERTY + keyboard = self._keyboard['QWERTY'] + else: + keyboard = self._keyboard[self._layout] + lowercase = {item for sublist in keyboard[0] for item in sublist} uppercase = {item for sublist in keyboard[1] for item in sublist} + keys = set(chain(*chain(*keyboard))) def _kb_array_for_char(char): """Return the keyboard layout that contains ch. @@ -192,6 +251,8 @@ def _kb_array_for_char(char): ValueError char not found in any keyboard layouts + .. versionadded:: 0.3.0 + """ if char in lowercase: return keyboard[0] @@ -200,8 +261,10 @@ def _kb_array_for_char(char): raise ValueError(char + ' not found in any keyboard layouts') def _substitution_cost(char1, char2): + if self._failsafe and (char1 not in keys or char2 not in keys): + return ins_cost + del_cost cost = sub_cost - cost *= metric_dict[metric](char1, char2) + shift_cost * ( + cost *= metric_dict[self._metric](char1, char2) + shift_cost * ( _kb_array_for_char(char1) != _kb_array_for_char(char2) ) return cost @@ -221,6 +284,8 @@ def _get_char_coord(char, kb_array): tuple The row & column of the key + .. versionadded:: 0.3.0 + """ for row in kb_array: # pragma: no branch if char in row: @@ -270,14 +335,7 @@ def _log_manhattan_keyboard_distance(char1, char2): return d_mat[len(src), len(tar)] - def dist( - self, - src, - tar, - metric='euclidean', - cost=(1, 1, 0.5, 0.5), - layout='QWERTY', - ): + def dist(self, src, tar): """Return the normalized typo distance between two strings. This is typo distance, normalized to [0, 1]. @@ -288,18 +346,6 @@ def dist( Source string for comparison tar : str Target string for comparison - metric : str - Supported values include: ``euclidean``, ``manhattan``, - ``log-euclidean``, and ``log-manhattan`` - cost : tuple - A 4-tuple representing the cost of the four possible edits: - inserts, deletes, substitutions, and shift, respectively (by - default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be - significantly less than the cost of an insertion & deletion unless - a log metric is used. - layout : str - Name of the keyboard layout to use (Currently supported: - ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``) Returns ------- @@ -318,15 +364,26 @@ def dist( >>> cmp.dist('ATCG', 'TAGC') 0.625 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if src == tar: return 0.0 - ins_cost, del_cost = cost[:2] - return self.dist_abs(src, tar, metric, cost, layout) / ( + ins_cost, del_cost = self._cost[:2] + return self.dist_abs(src, tar) / ( max(len(src) * del_cost, len(tar) * ins_cost) ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Typo.dist_abs method instead.', +) def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'): """Return the typo distance between two strings. @@ -385,10 +442,18 @@ def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'): >>> typo('ATCG', 'TAGC', metric='log-manhattan') 2.3465736 + .. 
versionadded:: 0.3.0 + """ - return Typo().dist_abs(src, tar, metric, cost, layout) + return Typo(metric, cost, layout).dist_abs(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Typo.dist method instead.', +) def dist_typo( src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY' ): @@ -431,10 +496,18 @@ def dist_typo( >>> dist_typo('ATCG', 'TAGC') 0.625 + .. versionadded:: 0.3.0 + """ - return Typo().dist(src, tar, metric, cost, layout) + return Typo(metric, cost, layout).dist(src, tar) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Typo.sim method instead.', +) def sim_typo( src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY' ): @@ -477,8 +550,10 @@ def sim_typo( >>> sim_typo('ATCG', 'TAGC') 0.375 + .. versionadded:: 0.3.0 + """ - return Typo().sim(src, tar, metric, cost, layout) + return Typo(metric, cost, layout).sim(src, tar) if __name__ == '__main__': diff --git a/abydos/distance/_unigram_subtuple.py b/abydos/distance/_unigram_subtuple.py new file mode 100644 index 000000000..b802fe374 --- /dev/null +++ b/abydos/distance/_unigram_subtuple.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unigram_subtuple. + +Unigram subtuple similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['UnigramSubtuple'] + + +class UnigramSubtuple(_TokenDistance): + r"""Unigram subtuple similarity. + + For two sets X and Y and a population N, unigram subtuple similarity + :cite:`Pecina:2010` is + + .. math:: + + sim_{unigram~subtuple}(X, Y) = + log(\frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + {|X \setminus Y| \cdot |Y \setminus Y|}) - 3.29 \cdot + \sqrt{\frac{1}{|X \cap Y|} + \frac{1}{|X \setminus Y|} + + \frac{1}{|Y \setminus X|} + + \frac{1}{|(N \setminus X) \setminus Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{unigram~subtuple} = + log(\frac{ad}{bc}) - 3.29 \cdot + \sqrt{\frac{1}{a} + \frac{1}{b} + \frac{1}{c} + \frac{1}{d}} + + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnigramSubtuple instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnigramSubtuple, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the unigram subtuple similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unigram subtuple similarity + + Examples + -------- + >>> cmp = UnigramSubtuple() + >>> cmp.sim_score('cat', 'hat') + 1.9324426894059226 + >>> cmp.sim_score('Niall', 'Neil') + 1.4347242883606355 + >>> cmp.sim_score('aluminum', 'Catalan') + -1.0866724701675263 + >>> cmp.sim_score('ATCG', 'TAGC') + -0.461880260111438 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = max(1, self._intersection_card()) + b = max(1, self._src_only_card()) + c = max(1, self._tar_only_card()) + d = max(1, self._total_complement_card()) + + return ( + log(a * d / (b * c)) + - 3.29 * (1 / a + 1 / b + 1 / c + 1 / d) ** 0.5 + ) + + def sim(self, src, tar): + """Return the unigram subtuple similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unigram subtuple similarity + + Examples + -------- + >>> cmp = UnigramSubtuple() + >>> cmp.sim('cat', 'hat') + 0.6215275850074894 + >>> cmp.sim('Niall', 'Neil') + 0.39805896767519555 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score < 0: + return 0.0 + return score / max(self.sim_score(src, src), self.sim_score(tar, tar)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_a.py b/abydos/distance/_unknown_a.py new file mode 100644 index 000000000..ab22ac481 --- /dev/null +++ b/abydos/distance/_unknown_a.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
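Stepping back to the unigram subtuple measure just defined: its raw sim_score doctests can be reproduced from the 2x2 quantities (my own arithmetic, worth double-checking). For 'cat' vs 'hat', with a = b = c = 2 and d = 778 under the default bigram alphabet, :math:`log(\frac{2 \cdot 778}{2 \cdot 2}) - 3.29\sqrt{\frac{1}{2}+\frac{1}{2}+\frac{1}{2}+\frac{1}{778}} \approx 5.964 - 4.031 \approx 1.932`, in line with the first doctest above.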
+ +"""abydos.distance._unknown_a. + +Unknown A correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownA'] + + +class UnknownA(_TokenDistance): + r"""Unknown A correlation. + + For two sets X and Y and a population N, Unknown A correlation + is sometimes attributed to :cite:`Peirce:1884`. It differs from + :py:class:`Peirce` in that the denominator is the product of the opposite + pair of marginals: + + .. math:: + + corr_{UnknownA}(X, Y) = \frac{|X \cap Y| \cdot + |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {|Y| \cdot |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{UnknownA} = + \frac{ad-bc}{(a+c)(b+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownA instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownA, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Unknown A correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown A correlation + + Examples + -------- + >>> cmp = UnknownA() + >>> cmp.corr('cat', 'hat') + 0.49743589743589745 + >>> cmp.corr('Niall', 'Neil') + 0.39486521181001283 + >>> cmp.corr('aluminum', 'Catalan') + 0.1147039897039897 + >>> cmp.corr('ATCG', 'TAGC') + -0.006418485237483954 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + if num: + return num / ((a + c) * (b + d)) + return 0.0 + + def sim(self, src, tar): + """Return the Unknown A similarity of two strings.
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown A similarity + + Examples + -------- + >>> cmp = UnknownA() + >>> cmp.sim('cat', 'hat') + 0.7487179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.6974326059050064 + >>> cmp.sim('aluminum', 'Catalan') + 0.5573519948519948 + >>> cmp.sim('ATCG', 'TAGC') + 0.496790757381258 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_b.py b/abydos/distance/_unknown_b.py new file mode 100644 index 000000000..12facabb8 --- /dev/null +++ b/abydos/distance/_unknown_b.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_b. + +Unknown B similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownB'] + + +class UnknownB(_TokenDistance): + r"""Unknown B similarity. + + For two sets X and Y and a population N, Unknown B similarity, which + :cite:`Morris:2012` attributes to :cite:`Doolittle:1884` but could not be + located in that source, is + + .. math:: + + sim_{UnknownB}(X, Y) = + \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|)^2} + {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownB} = + \frac{(ad-bc)^2}{(a+b)(a+c)(b+d)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownB instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(UnknownB, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown B similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown B similarity + + Examples + -------- + >>> cmp = UnknownB() + >>> cmp.sim('cat', 'hat') + 0.24744247205785666 + >>> cmp.sim('Niall', 'Neil') + 0.13009912077202224 + >>> cmp.sim('aluminum', 'Catalan') + 0.011710186806836291 + >>> cmp.sim('ATCG', 'TAGC') + 4.1196952743799446e-05 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = (a * d - b * c) ** 2 + if num: + return num / ((a + b) * (a + c) * (b + d) * (c + d)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_c.py b/abydos/distance/_unknown_c.py new file mode 100644 index 000000000..acf4db722 --- /dev/null +++ b/abydos/distance/_unknown_c.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_c. + +Unknown C similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownC'] + + +class UnknownC(_TokenDistance): + r"""Unknown C similarity. + + For two sets X and Y and a population N, Unknown C similarity, which + :cite:`Morris:2012` attributes to :cite:`Gower:1971` but could not be + located in that source, is + + .. math:: + + sim_{UnknownC}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|} + {\sqrt{|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownC} = + \frac{a+d}{\sqrt{(a+b)(a+c)(b+d)(c+d)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownC instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. 
+ **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownC, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown C similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown C similarity + + Examples + -------- + >>> cmp = UnknownC() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.18222244271345164 + >>> cmp.sim('aluminum', 'Catalan') + 0.11686463498390019 + >>> cmp.sim('ATCG', 'TAGC') + 0.1987163029525032 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a + d + if num: + return ( + num + / ( + max(1, a + b) + * max(1, a + c) + * max(1, b + d) + * max(1, c + d) + ) + ** 0.5 + ) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_d.py b/abydos/distance/_unknown_d.py new file mode 100644 index 000000000..32a1ef8cf --- /dev/null +++ b/abydos/distance/_unknown_d.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_d. + +Unknown D similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownD'] + + +class UnknownD(_TokenDistance): + r"""Unknown D similarity. + + For two sets X and Y and a population N, Unknown D similarity, which + :cite:`Morris:2012` attributes to :cite:`Peirce:1884` but could not be + located in that source, is + + .. math:: + + sim_{UnknownD}(X, Y) = + \frac{|X \cap Y| \cdot |X \setminus Y| + + |X \setminus Y| \cdot |Y \setminus X|} + {|X \cap Y| \cdot |X \setminus Y| + + 2 \cdot |X \setminus Y| \cdot |Y \setminus X| + + |Y \setminus X| \cdot |(N \setminus X) \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownD} = + \frac{ab+bc}{ab+2bc+cd} + + ..
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownD instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownD, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown D similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown D similarity + + Examples + -------- + >>> cmp = UnknownD() + >>> cmp.sim('cat', 'hat') + 0.00510204081632653 + >>> cmp.sim('Niall', 'Neil') + 0.00848536274925753 + >>> cmp.sim('aluminum', 'Catalan') + 0.011630019989096857 + >>> cmp.sim('ATCG', 'TAGC') + 0.006377551020408163 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * b + b * c + if num: + return num / (a * b + 2 * b * c + c * d) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_e.py b/abydos/distance/_unknown_e.py new file mode 100644 index 000000000..7aab9412e --- /dev/null +++ b/abydos/distance/_unknown_e.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_e. + +Unknown E correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownE'] + + +class UnknownE(_TokenDistance): + r"""Unknown E correlation. 
+ + For two sets X and Y and a population N, Unknown E correlation, which + :cite:`Morris:2012` attributes to :cite:`Goodman:1954` but could not be + located in that source, is + + .. math:: + + corr_{UnknownE}(X, Y) = + \frac{2 \cdot min(|X \cap Y|, |(N \setminus X) \setminus Y|) - + |X \setminus Y| - |Y \setminus X|} + {2 \cdot min(|X \cap Y|, |(N \setminus X) \setminus Y|) + + |X \setminus Y| + |Y \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{UnknownE} = + \frac{2 \cdot min(a, d) - b - c}{2 \cdot min(a, d) + b + c} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownE instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownE, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Unknown E correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown E correlation + + Examples + -------- + >>> cmp = UnknownE() + >>> cmp.corr('cat', 'hat') + 0.0 + >>> cmp.corr('Niall', 'Neil') + -0.2727272727272727 + >>> cmp.corr('aluminum', 'Catalan') + -0.7647058823529411 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = 2 * min(a, d) - b - c + if num: + return num / (2 * min(a, d) + b + c) + return 0.0 + + def sim(self, src, tar): + """Return the Unknown E similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown E similarity + + Examples + -------- + >>> cmp = UnknownE() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.11764705882352944 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_f.py b/abydos/distance/_unknown_f.py new file mode 100644 index 000000000..aad1168c1 --- /dev/null +++ b/abydos/distance/_unknown_f.py @@ -0,0 +1,229 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_f. + +Unknown F similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import log + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownF'] + + +class UnknownF(_TokenDistance): + r"""Unknown F similarity. + + For two sets X and Y and a population N, Unknown F similarity, which + :cite:`Choi:2010` attributes to :cite:`Gilbert:1966` but could not be + located in that source, is given as + + .. math:: + + sim(X, Y) = + log(|X \cap Y|) - log(|N|) - log\Big(\frac{|X|}{|N|}\Big) - + log\Big(\frac{|Y|}{|N|}\Big) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim = + log(a) - log(n) - log\Big(\frac{a+b}{n}\Big) - + log\Big(\frac{a+c}{n}\Big) + + This formula is not very normalizable, so the following formula is used + instead: + + .. math:: + + sim_{UnknownF}(X, Y) = + min\Bigg(1, 1+log\Big(\frac{|X \cap Y|}{|N|}\Big) - + \frac{1}{2}\Bigg(log\Big(\frac{|X|}{|N|}\Big) + + log\Big(\frac{|Y|}{|N|}\Big)\Bigg)\Bigg) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownF} = + min\Bigg(1, 1+log\Big(\frac{a}{n}\Big) - + \frac{1}{2}\Bigg(log\Big(\frac{a+b}{n}\Big) + + log\Big(\frac{a+c}{n}\Big)\Bigg)\Bigg) + + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownF instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. 
versionadded:: 0.4.0 + + """ + super(UnknownF, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Unknown F similarity between two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown F similarity + + Examples + -------- + >>> cmp = UnknownF() + >>> cmp.sim_score('cat', 'hat') + 0.3068528194400555 + >>> cmp.sim_score('Niall', 'Neil') + -0.007451510271132555 + >>> cmp.sim_score('aluminum', 'Catalan') + -1.1383330595080272 + >>> cmp.sim_score('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + n = self._population_unique_card() + + part1 = a / n + if part1 == 0: + part1 = 1 + + return min( + 1.0, 1 + log(part1) - (log((a + b) / n) + log((a + c) / n)) / 2 + ) + + def sim(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Unknown F similarity + + + .. versionadded:: 0.4.0 + + """ + raise NotImplementedError('Method disabled for Unknown F similarity.') + + def dist(self, *args, **kwargs): + """Raise exception when called. + + Parameters + ---------- + *args + Variable length argument list + **kwargs + Arbitrary keyword arguments + + Raises + ------ + NotImplementedError + Method disabled for Unknown F similarity + + + .. versionadded:: 0.4.0 + + """ + raise NotImplementedError('Method disabled for Unknown F similarity.') + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_g.py b/abydos/distance/_unknown_g.py new file mode 100644 index 000000000..baae15d57 --- /dev/null +++ b/abydos/distance/_unknown_g.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_g. + +Unknown G similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownG'] + + +class UnknownG(_TokenDistance): + r"""Unknown G similarity. + + For two sets X and Y and a population N, Unknown G similarity, which + :cite:`Choi:2010` attributes to :cite:`Kulczynski:1927` but could not be + located in that source, is + + .. math:: + + sim_{UnknownG}(X, Y) = + \frac{\frac{|X \cap Y|}{2} \cdot (|X| + |Y|)} + {|X| \cdot |Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. 
math:: + + sim_{UnknownG} = + \frac{\frac{a}{2} \cdot (2a+b+c)}{(a+b)(a+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize UnknownG instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownG, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown G similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown G similarity + + Examples + -------- + >>> cmp = UnknownG() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36666666666666664 + >>> cmp.sim('aluminum', 'Catalan') + 0.11805555555555555 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + num = 0.5 * a * (2 * a + b + c) + if num: + return num / ((a + b) * (a + c)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_h.py b/abydos/distance/_unknown_h.py new file mode 100644 index 000000000..4c3e7cffa --- /dev/null +++ b/abydos/distance/_unknown_h.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_h. + +Unknown H similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownH'] + + +class UnknownH(_TokenDistance): + r"""Unknown H similarity. + + For two sets X and Y and a population N, Unknown H similarity is a variant + of the Fager-McGowan index of affinity :cite:`Fager:1957,Fager:1963`. It uses + minimum rather than maximum in the denominator of the second term, and is + sometimes misidentified as the Fager-McGowan index of affinity + (cf. :cite:`Whittaker:1982`, for example). + + ..
math:: + + sim_{UnknownH}(X, Y) = + \frac{|X \cap Y|}{\sqrt{|X|\cdot|Y|}} - + \frac{1}{2\sqrt{min(|X|, |Y|)}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownH} = + \frac{a}{\sqrt{(a+b)(a+c)}} - \frac{1}{2\sqrt{min(a+b, a+c)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownH instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownH, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Unknown H similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown H similarity + + Examples + -------- + >>> cmp = UnknownH() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.14154157392013175 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 0.0 + + a = self._intersection_card() + apb = self._src_card() + apc = self._tar_card() + + first = a / (apb * apc) ** 0.5 if a else 0.0 + second = 1 / (2 * (min(apb, apc) ** 0.5)) + + return first - second + + def sim(self, src, tar): + r"""Return the normalized Unknown H similarity of two strings. + + As this similarity ranges from :math:`(-\inf, 1.0)`, this normalization + simply clamps the value to the range (0.0, 1.0). + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Unknown H similarity + + Examples + -------- + >>> cmp = UnknownH() + >>> cmp.sim('cat', 'hat') + 0.25 + >>> cmp.sim('Niall', 'Neil') + 0.14154157392013175 + >>> cmp.sim('aluminum', 'Catalan') + 0.0 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return max(0.0, self.sim_score(src, tar)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_i.py b/abydos/distance/_unknown_i.py new file mode 100644 index 000000000..320d400d0 --- /dev/null +++ b/abydos/distance/_unknown_i.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. 
+# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_i. + +Unknown I similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownI'] + + +class UnknownI(_TokenDistance): + r"""Unknown I similarity. + + For two sets X and Y, the Unknown I similarity is based on + Mountford similarity :cite:`Mountford:1962` :class:`Mountford`. + + .. math:: + + sim_{UnknownI}(X, Y) = + \frac{2(|X \cap Y|+1)}{2((|X|+2)\cdot(|Y|+2))- + (|X|+|Y|+4)\cdot(|X \cap Y|+1)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownI} = + \frac{2(a+1)}{2(a+b+2)(a+c+2)-(2a+b+c+4)(a+1)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize UnknownI instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownI, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown I similarity + + Examples + -------- + >>> cmp = UnknownI() + >>> cmp.sim('cat', 'hat') + 0.16666666666666666 + >>> cmp.sim('Niall', 'Neil') + 0.08955223880597014 + >>> cmp.sim('aluminum', 'Catalan') + 0.02247191011235955 + >>> cmp.sim('ATCG', 'TAGC') + 0.023809523809523808 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + 1 + b = self._src_only_card() + 1 + c = self._tar_only_card() + 1 + + return 2.0 * a / (c * (a + 2.0 * b) + a * b) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_j.py b/abydos/distance/_unknown_j.py new file mode 100644 index 000000000..c76ee888d --- /dev/null +++ b/abydos/distance/_unknown_j.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_j. + +Unknown J similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownJ'] + + +class UnknownJ(_TokenDistance): + r"""Unknown J similarity. + + For two sets X and Y and a population N, Unknown J similarity, which + :cite:`SequentiX:2018` attributes to "Kocher & Wang" but could not be + located, is + + .. math:: + + sim_{UnknownJ}(X, Y) = + |X \cap Y| \cdot \frac{|N|}{|X| \cdot |N \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownJ} = + a \cdot \frac{n}{(a+b)(c+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownJ instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownJ, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Unknown J similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown J similarity + + Examples + -------- + >>> cmp = UnknownJ() + >>> cmp.sim_score('cat', 'hat') + 0.5025641025641026 + >>> cmp.sim_score('Niall', 'Neil') + 0.33590402742073694 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.11239977090492555 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = max(1.0, self._total_complement_card()) + n = a + b + c + d + + an = a * n + if an: + return a * n / ((a + b) * (c + d)) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Unknown J similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Unknown J similarity + + Examples + -------- + >>> cmp = UnknownJ() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.33333333333333337 + >>> cmp.sim('aluminum', 'Catalan') + 0.11111111111111112 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.sim_score(src, tar) + if score: + return score / max( + self.sim_score(src, src), self.sim_score(tar, tar) + ) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_k.py b/abydos/distance/_unknown_k.py new file mode 100644 index 000000000..688b03214 --- /dev/null +++ b/abydos/distance/_unknown_k.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_k. + +Unknown K distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownK'] + + +class UnknownK(_TokenDistance): + r"""Unknown K distance. + + For two sets X and Y and a population N, Unknown K distance, which + :cite:`SequentiX:2018` attributes to "Excoffier" but could not be + located, is + + .. math:: + + dist_{UnknownK}(X, Y) = + |N| \cdot (1 - \frac{|X \cap Y|}{|N|}) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{UnknownK} = + n \cdot (1 - \frac{a}{n}) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownK instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. 
+ metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownK, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar): + """Return the Unknown K distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown K distance + + Examples + -------- + >>> cmp = UnknownK() + >>> cmp.dist_abs('cat', 'hat') + 782.0 + >>> cmp.dist_abs('Niall', 'Neil') + 782.0 + >>> cmp.dist_abs('aluminum', 'Catalan') + 784.0 + >>> cmp.dist_abs('ATCG', 'TAGC') + 784.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + n = self._population_unique_card() + + if not n: + return 0.0 + return n * (1 - a / n) + + def dist(self, src, tar): + """Return the normalized Unknown K distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Unknown K distance + + Examples + -------- + >>> cmp = UnknownK() + >>> cmp.dist('cat', 'hat') + 0.9974489795918368 + >>> cmp.dist('Niall', 'Neil') + 0.9974489795918368 + >>> cmp.dist('aluminum', 'Catalan') + 0.9987261146496815 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + score = self.dist_abs(src, tar) + norm = self._population_unique_card() + if score: + return score / norm + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_l.py b/abydos/distance/_unknown_l.py new file mode 100644 index 000000000..ad4bf0a71 --- /dev/null +++ b/abydos/distance/_unknown_l.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_l. + +Unknown L similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownL'] + + +class UnknownL(_TokenDistance): + r"""Unknown L similarity. + + For two sets X and Y and a population N, Unknown L similarity, which + :cite:`SequentiX:2018` attributes to "Roux" but could not be + located, is + + .. math:: + + sim_{UnknownL}(X, Y) = + \frac{|X \cap Y| + |(N \setminus X) \setminus Y|} + {min(|X \setminus Y|, |Y \setminus X|) + + min(|N|-|X \setminus Y|, |N|-|Y \setminus X|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. 
math:: + + sim_{UnknownL} = + \frac{a+d}{min(b, c) + min(n-b, n-c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownL instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownL, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Unknown L similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown L similarity + + Examples + -------- + >>> cmp = UnknownL() + >>> cmp.sim('cat', 'hat') + 0.9948979591836735 + >>> cmp.sim('Niall', 'Neil') + 0.9923371647509579 + >>> cmp.sim('aluminum', 'Catalan') + 0.9821428571428571 + >>> cmp.sim('ATCG', 'TAGC') + 0.9872448979591837 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + if not self._src_card() or not self._tar_card(): + return 1.0 + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return (a + d) / (min(b, c) + min(n - b, n - c)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_unknown_m.py b/abydos/distance/_unknown_m.py new file mode 100644 index 000000000..a449b08da --- /dev/null +++ b/abydos/distance/_unknown_m.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._unknown_m. + +Unknown M similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['UnknownM'] + + +class UnknownM(_TokenDistance): + r"""Unknown M similarity. 
+ + For two sets X and Y and a population N, Unknown M similarity, which + :cite:`SequentiX:2018` attributes to "Roux" but could not be + located, is + + .. math:: + + sim_{UnknownM}(X, Y) = + \frac{|N|-|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + {\sqrt{|X| \cdot |N \setminus X| \cdot |Y| \cdot |N \setminus Y|}} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{UnknownM} = + \frac{n-ad}{\sqrt{(a+b)(c+d)(a+c)(b+d)}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize UnknownM instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(UnknownM, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Unknown M similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Unknown M similarity + + Examples + -------- + >>> cmp = UnknownM() + >>> cmp.sim_score('cat', 'hat') + -0.24743589743589745 + >>> cmp.sim_score('Niall', 'Neil') + -0.17964271701223158 + >>> cmp.sim_score('aluminum', 'Catalan') + 0.0024283560516135103 + >>> cmp.sim_score('ATCG', 'TAGC') + 0.2012836970474968 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + return (n - a * d) / ( + max(1.0, a + b) + * max(1.0, c + d) + * max(1.0, a + c) + * max(1.0, b + d) + ) ** 0.5 + + def sim(self, src, tar): + """Return the normalized Unknown M similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Unknown M similarity + + Examples + -------- + >>> cmp = UnknownM() + >>> cmp.sim('cat', 'hat') + 0.6237179487179487 + >>> cmp.sim('Niall', 'Neil') + 0.5898213585061158 + >>> cmp.sim('aluminum', 'Catalan') + 0.49878582197419324 + >>> cmp.sim('ATCG', 'TAGC') + 0.3993581514762516 + + + ..
versionadded:: 0.4.0 + + """ + return (1.0 - self.sim_score(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_upholt.py b/abydos/distance/_upholt.py new file mode 100644 index 000000000..e3967741d --- /dev/null +++ b/abydos/distance/_upholt.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._upholt. + +Upholt similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Upholt'] + + +class Upholt(_TokenDistance): + r"""Upholt similarity. + + For two sets X and Y and a population N, Upholt similarity, Upholt's S, + :cite:`Upholt:1977` is + + .. math:: + + sim_{Upholt}(X, Y) = + \frac{1}{2}\Bigg(-\frac{2 \cdot |X \cap Y|}{|X| + |Y|} + + \sqrt{\Big(\frac{2 \cdot |X \cap Y|}{|X| + |Y|}\Big)^2 + + 8\frac{2 \cdot |X \cap Y|}{|X| + |Y|}}\Bigg) + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Upholt}(X, Y) = + \frac{1}{2}\Bigg(-\frac{2a}{2a+b+c} + + \sqrt{\Big(\frac{2a}{2a+b+c}\Big)^2 + + 8\frac{2a}{2a+b+c}}\Bigg) + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize Upholt instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(Upholt, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Upholt similarity of two strings. 
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Upholt similarity + + Examples + -------- + >>> cmp = Upholt() + >>> cmp.sim('cat', 'hat') + 0.7807764064044151 + >>> cmp.sim('Niall', 'Neil') + 0.6901511860568581 + >>> cmp.sim('aluminum', 'Catalan') + 0.42980140370106323 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + f = 2 * a / (2 * a + b + c) + + return (-f + ((8 + f) * f) ** 0.5) / 2 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_warrens_i.py b/abydos/distance/_warrens_i.py new file mode 100644 index 000000000..4c8968a4f --- /dev/null +++ b/abydos/distance/_warrens_i.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._warrens_i. + +Warrens I correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WarrensI'] + + +class WarrensI(_TokenDistance): + r"""Warrens I correlation. + + For two sets X and Y, Warrens I correlation :math:`S_{NS1}` + :cite:`Warrens:2008` is + + .. math:: + + corr_{WarrensI}(X, Y) = + \frac{2|X \cap Y| - |X \setminus Y| - |Y \setminus X|} + {2|X \cap Y| + |X \setminus Y| + |Y \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{WarrensI} = + \frac{2a-b-c}{2a+b+c} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): + """Initialize WarrensI instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(WarrensI, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def corr(self, src, tar): + """Return the Warrens I correlation of two strings. 
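# Consistency sketch (not part of the patch): Upholt's S is a closed-form function
# of the Dice-style ratio F = 2a/(2a+b+c) computed in the method above. The counts
# below are derived by hand for 'Niall'/'Neil' under the default q=2 tokenizer
# with start/stop padding.
a, b, c = 2, 4, 3
F = 2 * a / (2 * a + b + c)
print((-F + (F * F + 8 * F) ** 0.5) / 2)   # ~0.690151..., matching the doctest above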
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens I correlation + + Examples + -------- + >>> cmp = WarrensI() + >>> cmp.corr('cat', 'hat') + 0.0 + >>> cmp.corr('Niall', 'Neil') + -0.2727272727272727 + >>> cmp.corr('aluminum', 'Catalan') + -0.7647058823529411 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return (2 * a - b - c) / (2 * a + b + c) + + def sim(self, src, tar): + """Return the Warrens I similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens I similarity + + Examples + -------- + >>> cmp = WarrensI() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.36363636363636365 + >>> cmp.sim('aluminum', 'Catalan') + 0.11764705882352944 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_warrens_ii.py b/abydos/distance/_warrens_ii.py new file mode 100644 index 000000000..e49a7d536 --- /dev/null +++ b/abydos/distance/_warrens_ii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._warrens_ii. + +Warrens II similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WarrensII'] + + +class WarrensII(_TokenDistance): + r"""Warrens II similarity. + + For two sets X and Y and a population N, Warrens II similarity + :math:`S_{NS2}` :cite:`Warrens:2008` is + + .. math:: + + sim_{WarrensII}(X, Y) = + \frac{2|(N \setminus X) \setminus Y|} + {|N \setminus X| + |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{WarrensII} = + \frac{2d}{b+c+2d} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize WarrensII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. 
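# Observation (illustrative, not part of the patch): since corr = (2a-b-c)/(2a+b+c),
# the rescaled sim = (1 + corr)/2 reduces algebraically to the Sorensen-Dice
# coefficient 2a/(2a+b+c). Checked here with the hand-derived 'Niall'/'Neil' counts.
a, b, c = 2, 4, 3
corr = (2 * a - b - c) / (2 * a + b + c)                        # ~-0.2727, as in the corr doctest
assert abs((1 + corr) / 2 - 2 * a / (2 * a + b + c)) < 1e-12    # both ~0.3636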
+ tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(WarrensII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Warrens II similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens II similarity + + Examples + -------- + >>> cmp = WarrensII() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9955041746949261 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903412749517064 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if d: + return 2 * d / (b + c + 2 * d) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_warrens_iii.py b/abydos/distance/_warrens_iii.py new file mode 100644 index 000000000..46e2efd1f --- /dev/null +++ b/abydos/distance/_warrens_iii.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._warrens_iii. + +Warrens III correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WarrensIII'] + + +class WarrensIII(_TokenDistance): + r"""Warrens III correlation. + + For two sets X and Y and a population N, Warrens III correlation + :math:`S_{NS3}` :cite:`Warrens:2008` is + + .. math:: + + corr_{WarrensIII}(X, Y) = + \frac{2|(N \setminus X) \setminus Y| - |X \setminus Y| - + |Y \setminus X|}{|N \setminus X| + |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{WarrensIII} = + \frac{2d-b-c}{2d+b+c} + + .. 
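# Illustrative sketch (not part of the patch): WarrensII is driven by d, the count
# of bigrams absent from both strings, so its value depends on the assumed
# alphabet/population. The doctest values imply a default population of 784
# bigrams for q=2 (e.g. d = 778 for 'cat'/'hat'); that figure is inferred here,
# not stated in the patch.
b, c = 2, 2                        # bigrams unique to 'cat' and to 'hat'
d = 784 - (2 + b + c)              # a = 2 shared bigrams
print(2 * d / (b + c + 2 * d))     # ~0.997435897..., matching the doctest above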
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize WarrensIII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(WarrensIII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return the Warrens III correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens III correlation + + Examples + -------- + >>> cmp = WarrensIII() + >>> cmp.corr('cat', 'hat') + 0.9948717948717949 + >>> cmp.corr('Niall', 'Neil') + 0.9910083493898523 + >>> cmp.corr('aluminum', 'Catalan') + 0.9806825499034127 + >>> cmp.corr('ATCG', 'TAGC') + 0.9871630295250321 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = 2 * d - b - c + if num: + return num / (2 * d + b + c) + return 0.0 + + def sim(self, src, tar): + """Return the Warrens III similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens III similarity + + Examples + -------- + >>> cmp = WarrensIII() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9955041746949261 + >>> cmp.sim('aluminum', 'Catalan') + 0.9903412749517064 + >>> cmp.sim('ATCG', 'TAGC') + 0.993581514762516 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_warrens_iv.py b/abydos/distance/_warrens_iv.py new file mode 100644 index 000000000..0472e41ff --- /dev/null +++ b/abydos/distance/_warrens_iv.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
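# Observation (illustrative, not part of the patch): corr_{WarrensIII} equals
# 2 * sim_{WarrensII} - 1, so WarrensIII.sim and WarrensII.sim coincide (the sim
# doctests for the two classes show identical values). Package-level exports
# from abydos.distance are assumed.
from abydos.distance import WarrensII, WarrensIII

for pair in [('cat', 'hat'), ('Niall', 'Neil'), ('aluminum', 'Catalan')]:
    assert abs(WarrensII().sim(*pair) - WarrensIII().sim(*pair)) < 1e-12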
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._warrens_iv. + +Warrens IV similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WarrensIV'] + + +class WarrensIV(_TokenDistance): + r"""Warrens IV similarity. + + For two sets X and Y and a population N, Warrens IV similarity + :cite:`Warrens:2008` is + + .. math:: + + sim_{WarrensIV}(X, Y) = + \frac{4|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + {4|X \cap Y| \cdot |(N \setminus X) \setminus Y| + + (|X \cap Y| + |(N \setminus X) \setminus Y|) + (|X \setminus Y| + |Y \setminus X|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{WarrensIV} = + \frac{4ad}{4ad + (a+d)(b+c)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize WarrensIV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(WarrensIV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim(self, src, tar): + """Return the Warrens IV similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens IV similarity + + Examples + -------- + >>> cmp = WarrensIV() + >>> cmp.sim('cat', 'hat') + 0.666095890410959 + >>> cmp.sim('Niall', 'Neil') + 0.5326918120113412 + >>> cmp.sim('aluminum', 'Catalan') + 0.21031040612607685 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. 
versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + atd = a * d + if atd: + return (4 * atd) / (4 * atd + (a + d) * (b + c)) + return 0.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_warrens_v.py b/abydos/distance/_warrens_v.py new file mode 100644 index 000000000..67ccd602a --- /dev/null +++ b/abydos/distance/_warrens_v.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._warrens_v. + +Warrens V similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WarrensV'] + + +class WarrensV(_TokenDistance): + r"""Warrens V similarity. + + For two sets X and Y and a population N, Warrens V similarity + :cite:`Warrens:2008` is + + .. math:: + + sim_{WarrensV}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {min(|X| \cdot |Y|, |N \setminus X| \cdot |N \setminus Y|)} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{WarrensV} = + \frac{ad-bc}{min( (a+b)(a+c), (b+d)(c+d) )} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize WarrensV instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(WarrensV, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar): + """Return the Warrens V similarity of two strings. 
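# Consistency sketch (not part of the patch): plugging the hand-derived 'cat'/'hat'
# counts (with the inferred default population of 784 bigrams) into the WarrensIV
# formula reproduces the doctest value above.
a, b, c, d = 2, 2, 2, 778
print(4 * a * d / (4 * a * d + (a + d) * (b + c)))   # ~0.666095890..., as in the doctest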
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Warrens V similarity + + Examples + -------- + >>> cmp = WarrensV() + >>> cmp.sim_score('cat', 'hat') + 97.0 + >>> cmp.sim_score('Niall', 'Neil') + 51.266666666666666 + >>> cmp.sim_score('aluminum', 'Catalan') + 9.902777777777779 + >>> cmp.sim_score('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + num = a * d - b * c + if num: + return num / min((a + b) * (a + c), (b + d) * (c + d)) + return 0.0 + + def sim(self, src, tar): + """Return the normalized Warrens V similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Warrens V similarity + + Examples + -------- + >>> cmp = WarrensV() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.3333333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.11125283446712018 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + score = self.sim_score(src, tar) + if not score: + return 0.0 + + norm = max(self.sim_score(src, src), self.sim_score(tar, tar)) + + return (1.0 + score) / (1.0 + norm) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_weighted_jaccard.py b/abydos/distance/_weighted_jaccard.py new file mode 100644 index 000000000..ada1d77e3 --- /dev/null +++ b/abydos/distance/_weighted_jaccard.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._weighted_jaccard. + +Weighted Jaccard similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['WeightedJaccard'] + + +class WeightedJaccard(_TokenDistance): + r"""Weighted Jaccard similarity. + + For two sets X and Y and a weight w, the Weighted Jaccard similarity + :cite:`Legendre:1998` is + + .. math:: + + sim_{Jaccard_w}(X, Y) = \frac{w \cdot |X \cap Y|} + {w \cdot |X \cap Y| + |X \setminus Y| + |Y \setminus X|} + + Here, the intersection between the two sets is weighted by w. Compare to + Jaccard similarity (:math:`w = 1`), and to Dice similarity (:math:`w = 2`). + In the default case, the weight of the intersection is 3, following + :cite:`Legendre:1998`. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Jaccard_w} = + \frac{w\cdot a}{w\cdot a+b+c} + + .. 
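# Illustrative sketch (not part of the patch): WarrensV.sim rescales the raw score
# against the larger of the two self-similarity scores, per the sim method body
# above. Assumes WarrensV is exported from abydos.distance.
from abydos.distance import WarrensV

cmp = WarrensV()
raw = cmp.sim_score('cat', 'hat')    # 97.0 per the doctest
norm = max(cmp.sim_score('cat', 'cat'), cmp.sim_score('hat', 'hat'))
assert abs(cmp.sim('cat', 'hat') - (1 + raw) / (1 + norm)) < 1e-12   # 0.5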
versionadded:: 0.4.0 + """ + + def __init__( + self, tokenizer=None, intersection_type='crisp', weight=3, **kwargs + ): + """Initialize TripleWeightedJaccard instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + weight : int + The weight to apply to the intersection cardinality. (3, by + default.) + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + self.weight = weight + super(WeightedJaccard, self).__init__( + tokenizer=tokenizer, intersection_type=intersection_type, **kwargs + ) + + def sim(self, src, tar): + """Return the Triple Weighted Jaccard similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Weighted Jaccard similarity + + Examples + -------- + >>> cmp = WeightedJaccard() + >>> cmp.sim('cat', 'hat') + 0.6 + >>> cmp.sim('Niall', 'Neil') + 0.46153846153846156 + >>> cmp.sim('aluminum', 'Catalan') + 0.16666666666666666 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + + return self.weight * a / (self.weight * a + b + c) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_whittaker.py b/abydos/distance/_whittaker.py new file mode 100644 index 000000000..8899f1c3b --- /dev/null +++ b/abydos/distance/_whittaker.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._whittaker. + +Whittaker distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['Whittaker'] + + +class Whittaker(_TokenDistance): + r"""Whittaker distance. + + For two multisets X and Y drawn from an alphabet S, Whittaker distance + :cite:`Whittaker:1952` is + + .. math:: + + sim_{Whittaker}(X, Y) = 1 - + \frac{1}{2}\sum_{i \in S} \Bigg| \frac{|X_i|}{|X|} - + \frac{|Y_i|}{|Y|} \Bigg| + + .. 
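# Illustrative sketch (not part of the patch): the weight parameter interpolates
# between the classic coefficients, as the WeightedJaccard docstring above notes
# (w=1 is plain Jaccard, w=2 is Dice, and the cited default is w=3).
from abydos.distance import WeightedJaccard   # package-level export assumed

for w in (1, 2, 3):
    print(w, WeightedJaccard(weight=w).sim('cat', 'hat'))
# w=1 -> 1/3 (Jaccard), w=2 -> 0.5 (Dice), w=3 -> 0.6 (the doctest value above)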
versionadded:: 0.4.0 + """ + + def __init__(self, tokenizer=None, **kwargs): + """Initialize Whittaker instance. + + Parameters + ---------- + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + + + .. versionadded:: 0.4.0 + + """ + super(Whittaker, self).__init__(tokenizer=tokenizer, **kwargs) + + def sim(self, src, tar): + """Return the Whittaker distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Whittaker distance + + Examples + -------- + >>> cmp = Whittaker() + >>> cmp.sim('cat', 'hat') + 0.5 + >>> cmp.sim('Niall', 'Neil') + 0.33333333333333 + >>> cmp.sim('aluminum', 'Catalan') + 0.11111111111111 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + alphabet = self._total().keys() + + src_card = self._src_card() + tar_card = self._tar_card() + + return float( + round( + 1 + - 0.5 + * sum( + abs( + (self._src_tokens[tok] / src_card if src_card else 0.0) + - ( + self._tar_tokens[tok] / tar_card + if tar_card + else 0.0 + ) + ) + for tok in alphabet + ), + 14, + ) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yates_chi_squared.py b/abydos/distance/_yates_chi_squared.py new file mode 100644 index 000000000..f9da5db3f --- /dev/null +++ b/abydos/distance/_yates_chi_squared.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yates_chi_squared. + +Yates's Chi-Squared similarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import copysign + +from ._token_distance import _TokenDistance + +__all__ = ['YatesChiSquared'] + + +class YatesChiSquared(_TokenDistance): + r"""Yates's Chi-Squared similarity. + + For two sets X and Y and a population N, Yates's :math:`\chi^2` similarity + :cite:`Yates:1934` is + + .. math:: + + sim_{Yates_{\chi^2}}(X, Y) = + \frac{|N| \cdot (||X \cap Y| \cdot + |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|| - + \frac{|N|}{2})^2} + {|X| \cdot |N \setminus X| \cdot |Y| \cdot + |N \setminus Y|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + sim_{Yates_{\chi^2}} = + \frac{n \cdot (|ad-bc| - \frac{n}{2})^2}{(a+b)(c+d)(a+c)(b+d)} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize YatesChiSquared instance. 
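# Worked example (not part of the patch): 'cat' and 'hat' each yield four bigrams
# with relative frequency 1/4; the two shared bigrams contribute nothing to the
# Whittaker profile difference and the four unshared ones contribute 1/4 each.
profile_diff = 4 * 0.25
print(1 - 0.5 * profile_diff)   # 0.5, matching the Whittaker doctest above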
+ + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(YatesChiSquared, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def sim_score(self, src, tar, signed=False): + """Return Yates's Chi-Squared similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + signed : bool + If True, negative correlations will carry a negative sign + + Returns + ------- + float + Yates's Chi-Squared similarity + + Examples + -------- + >>> cmp = YatesChiSquared() + >>> cmp.sim_score('cat', 'hat') + 108.37343852728468 + >>> cmp.sim_score('Niall', 'Neil') + 56.630055670871954 + >>> cmp.sim_score('aluminum', 'Catalan') + 1.8574215841854373 + >>> cmp.sim_score('ATCG', 'TAGC') + 6.960385076156687 + + + .. versionadded:: 0.4.0 + + """ + if not src or not tar: + return 0.0 + + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + n = self._population_unique_card() + + admbc = a * d - b * c + num = n * (abs(admbc) - n / 2) ** 2 + denom = ( + max(1, (a + b)) + * max(1, (c + d)) + * max(1, (a + c)) + * max(1, (b + d)) + ) + if num: + score = num / denom + if signed: + score = copysign(score, admbc) + return score + return 0.0 + + def sim(self, src, tar): + """Return Yates's normalized Chi-Squared similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Yates's Chi-Squared similarity + + Examples + -------- + >>> cmp = YatesChiSquared() + >>> cmp.sim('cat', 'hat') + 0.18081199852082455 + >>> cmp.sim('Niall', 'Neil') + 0.08608296705052738 + >>> cmp.sim('aluminum', 'Catalan') + 0.0026563223707532654 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 1.0 + if not src or not tar: + return 0.0 + score = self.sim_score(src, tar, signed=True) + if score < 0: + return 0.0 + norm = max(self.sim_score(src, src), self.sim_score(tar, tar)) + return score / norm + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yjhhr.py b/abydos/distance/_yjhhr.py new file mode 100644 index 000000000..412c65c7f --- /dev/null +++ b/abydos/distance/_yjhhr.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. 
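# Illustrative sketch (not part of the patch): the signed flag documented above
# restores the sign of a*d - b*c, while sim() clamps negative correlations to 0.
# Assumes YatesChiSquared is exported from abydos.distance.
from abydos.distance import YatesChiSquared

cmp = YatesChiSquared()
print(cmp.sim_score('ATCG', 'TAGC'))               # magnitude only (~6.96)
print(cmp.sim_score('ATCG', 'TAGC', signed=True))  # negative: no shared bigrams, so a*d - b*c < 0
print(cmp.sim('ATCG', 'TAGC'))                     # 0.0, as in the doctest above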
+# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yjhhr. + +YJHHR distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['YJHHR'] + + +class YJHHR(_TokenDistance): + r"""YJHHR distance. + + For two sets X and Y and a parameter p, YJHHR distance + :cite:`Yang:2016` is + + .. math:: + + dist_{YJHHR_p}(X, Y) = + \sqrt[p]{|X \setminus Y|^p + |Y \setminus X|^p} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{YJHHR} = + \sqrt[p]{b^p + c^p} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + pval=1, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize YJHHR instance. + + Parameters + ---------- + pval : int + The :math:`p`-value of the :math:`L^p`-space + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(YJHHR, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + self.set_params(pval=pval) + + def dist_abs(self, src, tar): + """Return the YJHHR distance of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + YJHHR distance + + Examples + -------- + >>> cmp = YJHHR() + >>> cmp.dist_abs('cat', 'hat') + 4.0 + >>> cmp.dist_abs('Niall', 'Neil') + 7.0 + >>> cmp.dist_abs('aluminum', 'Catalan') + 15.0 + >>> cmp.dist_abs('ATCG', 'TAGC') + 10.0 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + self._tokenize(src, tar) + + b = self._src_only_card() ** self.params['pval'] + c = self._tar_only_card() ** self.params['pval'] + + return float(round((b + c) ** (1 / self.params['pval']), 14)) + + def dist(self, src, tar): + """Return the normalized YJHHR distance of two strings. 
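# Illustrative sketch (not part of the patch): pval selects the L^p norm over
# (b, c). With 'cat'/'hat' there are two bigrams unique to each side (b = c = 2).
from abydos.distance import YJHHR   # package-level export assumed

print(YJHHR(pval=1).dist_abs('cat', 'hat'))   # 4.0 = b + c, as in the doctest above
print(YJHHR(pval=2).dist_abs('cat', 'hat'))   # ~2.83 = sqrt(2**2 + 2**2)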
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + normalized YJHHR distance + + Examples + -------- + >>> cmp = YJHHR() + >>> cmp.dist('cat', 'hat') + 0.6666666666666666 + >>> cmp.dist('Niall', 'Neil') + 0.7777777777777778 + >>> cmp.dist('aluminum', 'Catalan') + 0.9375 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + distance = self.dist_abs(src, tar) + union = self._union_card() + if union == 0: + return 0.0 + return distance / union + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yujian_bo.py b/abydos/distance/_yujian_bo.py new file mode 100644 index 000000000..1fc1a2944 --- /dev/null +++ b/abydos/distance/_yujian_bo.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yujian_bo. + +Yujian-Bo normalized Levenshtein distance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._levenshtein import Levenshtein + +__all__ = ['YujianBo'] + + +class YujianBo(Levenshtein): + r"""Yujian-Bo normalized Levenshtein distance. + + Yujian-Bo's normalization of Levenshtein distance :cite:`Yujian:2007`, + given Levenshtein distance :math:`GLD(X, Y)` between two strings X and Y, + is + + .. math:: + + dist_{N-GLD}(X, Y) = + \frac{2 \cdot GLD(X, Y)}{|X| + |Y| + GLD(X, Y)} + + .. versionadded:: 0.4.0 + """ + + def __init__(self, cost=(1, 1, 1, 1), **kwargs): + """Initialize YujianBo instance. + + Parameters + ---------- + **kwargs + Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + super(YujianBo, self).__init__(cost=cost, **kwargs) + + def dist_abs(self, src, tar): + """Return the Yujian-Bo normalized edit distance between two strings. + + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + int + The Yujian-Bo normalized edit distance between src & tar + + Examples + -------- + >>> cmp = YujianBo() + >>> cmp.dist_abs('cat', 'hat') + 0.2857142857142857 + >>> cmp.dist_abs('Niall', 'Neil') + 0.5 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.6363636363636364 + >>> cmp.dist_abs('ATCG', 'TAGC') + 0.5454545454545454 + + + .. versionadded:: 0.4.0 + + """ + return self.dist(src, tar) + + def dist(self, src, tar): + """Return the Yujian-Bo normalized edit distance between strings. 
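# Worked example (not part of the patch): 'cat' -> 'hat' needs one substitution,
# so GLD = 1 and the Yujian-Bo normalization above gives 2*1 / (3 + 3 + 1) = 2/7.
gld, len_src, len_tar = 1, 3, 3
print(2 * gld / (len_src + len_tar + gld))   # ~0.285714..., matching the doctest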
+ + Parameters + ---------- + src : str + Source string for comparison + tar : str + Target string for comparison + + Returns + ------- + float + The Yujian-Bo normalized edit distance between src & tar + + Examples + -------- + >>> cmp = YujianBo() + >>> round(cmp.dist('cat', 'hat'), 12) + 0.285714285714 + >>> round(cmp.dist('Niall', 'Neil'), 12) + 0.5 + >>> cmp.dist('aluminum', 'Catalan') + 0.6363636363636364 + >>> cmp.dist('ATCG', 'TAGC') + 0.5454545454545454 + + + .. versionadded:: 0.4.0 + + """ + if src == tar: + return 0.0 + + ins_cost, del_cost = self._cost[:2] + gld = super(YujianBo, self).dist_abs(src, tar) + return 2 * gld / (len(src) * del_cost + len(tar) * ins_cost + gld) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yule_q.py b/abydos/distance/_yule_q.py new file mode 100644 index 000000000..46eb33275 --- /dev/null +++ b/abydos/distance/_yule_q.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yule_q. + +Yule's Q correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['YuleQ'] + + +class YuleQ(_TokenDistance): + r"""Yule's Q correlation. + + For two sets X and Y and a population N, Yule's Q correlation + :cite:`Yule:1912` is + + .. math:: + + corr_{Yule_Q}(X, Y) = + \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| - + |X \setminus Y| \cdot |Y \setminus X|} + {|X \cap Y| \cdot |(N \setminus X) \setminus Y| + + |X \setminus Y| \cdot |Y \setminus X|} + + Yule himself terms this the coefficient of association. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Yule_Q} = + \frac{ad-bc}{ad+bc} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize YuleQ instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. 
+ threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(YuleQ, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return Yule's Q correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Yule's Q correlation + + Examples + -------- + >>> cmp = YuleQ() + >>> cmp.corr('cat', 'hat') + 0.9948717948717949 + >>> cmp.corr('Niall', 'Neil') + 0.9846350832266325 + >>> cmp.corr('aluminum', 'Catalan') + 0.8642424242424243 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + admbc = a * d - b * c + if admbc: + return admbc / (a * d + b * c) + return 0.0 + + def sim(self, src, tar): + """Return Yule's Q similarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Yule's Q similarity + + Examples + -------- + >>> cmp = YuleQ() + >>> cmp.sim('cat', 'hat') + 0.9974358974358974 + >>> cmp.sim('Niall', 'Neil') + 0.9923175416133163 + >>> cmp.sim('aluminum', 'Catalan') + 0.9321212121212121 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yule_q_ii.py b/abydos/distance/_yule_q_ii.py new file mode 100644 index 000000000..342a2c5f3 --- /dev/null +++ b/abydos/distance/_yule_q_ii.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yule_q_ii. + +Yule's Q dissimilarity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['YuleQII'] + + +class YuleQII(_TokenDistance): + r"""Yule's Q dissimilarity. + + For two sets X and Y and a population N, Yule's Q dissimilarity + :cite:`Yule:1968` is + + .. math:: + + dist_{Yule_QII}(X, Y) = + \frac{2 \cdot |X \setminus Y| \cdot |Y \setminus X|} + {|X \cap Y| \cdot |(N \setminus X) \setminus Y| + + |X \setminus Y| \cdot |Y \setminus X|} + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + dist_{Yule_QII} = + \frac{2bc}{ad+bc} + + .. 
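# Worked example (not part of the patch): 'ATCG'/'TAGC' share no bigrams, so a = 0,
# Yule's Q bottoms out at -1, and the rescaled similarity is 0, matching the
# doctests above. d assumes the inferred default population of 784 bigrams.
a, b, c, d = 0, 5, 5, 774
corr = (a * d - b * c) / (a * d + b * c)   # -1.0
print((1 + corr) / 2)                      # 0.0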
versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize YuleQII instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(YuleQII, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def dist_abs(self, src, tar): + """Return Yule's Q dissimilarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Yule's Q II distance + + Examples + -------- + >>> cmp = YuleQII() + >>> cmp.dist_abs('cat', 'hat') + 0.005128205128205128 + >>> cmp.dist_abs('Niall', 'Neil') + 0.015364916773367477 + >>> cmp.dist_abs('aluminum', 'Catalan') + 0.13575757575757577 + >>> cmp.dist_abs('ATCG', 'TAGC') + 2.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + if not b or not c: + return 0.0 + return (2 * b * c) / (a * d + b * c) + + def dist(self, src, tar): + """Return normalized Yule's Q dissimilarity of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Normalized Yule's Q II distance + + Examples + -------- + >>> cmp = YuleQII() + >>> cmp.dist('cat', 'hat') + 0.002564102564102564 + >>> cmp.dist('Niall', 'Neil') + 0.0076824583866837385 + >>> cmp.dist('aluminum', 'Catalan') + 0.06787878787878789 + >>> cmp.dist('ATCG', 'TAGC') + 1.0 + + + .. versionadded:: 0.4.0 + + """ + return self.dist_abs(src, tar) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/distance/_yule_y.py b/abydos/distance/_yule_y.py new file mode 100644 index 000000000..7b6b498a0 --- /dev/null +++ b/abydos/distance/_yule_y.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
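# Observation (illustrative, not part of the patch): from the two formulas,
# dist_{Yule_QII} = 2bc/(ad+bc) = 1 - corr_{Yule_Q}, so YuleQII is simply the
# complement of Yule's Q correlation. Package-level exports are assumed.
from abydos.distance import YuleQ, YuleQII

for pair in [('cat', 'hat'), ('Niall', 'Neil'), ('aluminum', 'Catalan')]:
    assert abs(YuleQII().dist_abs(*pair) - (1 - YuleQ().corr(*pair))) < 1e-10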
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.distance._yule_y. + +Yule's Y correlation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._token_distance import _TokenDistance + +__all__ = ['YuleY'] + + +class YuleY(_TokenDistance): + r"""Yule's Y correlation. + + For two sets X and Y and a population N, Yule's Y correlation + :cite:`Yule:1912` is + + .. math:: + + corr_{Yule_Y}(X, Y) = + \frac{\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} - + \sqrt{|X \setminus Y| \cdot |Y \setminus X|}} + {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} + + \sqrt{|X \setminus Y| \cdot |Y \setminus X|}} + + In :cite:`Yule:1912`, this is labeled :math:`\omega`, so it is sometimes + referred to as Yule's :math:`\omega`. Yule himself terms this the + coefficient of colligation. + + In :ref:`2x2 confusion table terms `, where a+b+c+d=n, + this is + + .. math:: + + corr_{Yule_Y} = + \frac{\sqrt{ad}-\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}} + + .. versionadded:: 0.4.0 + """ + + def __init__( + self, + alphabet=None, + tokenizer=None, + intersection_type='crisp', + **kwargs + ): + """Initialize YuleY instance. + + Parameters + ---------- + alphabet : Counter, collection, int, or None + This represents the alphabet of possible tokens. + See :ref:`alphabet ` description in + :py:class:`_TokenDistance` for details. + tokenizer : _Tokenizer + A tokenizer instance from the :py:mod:`abydos.tokenizer` package + intersection_type : str + Specifies the intersection type, and set type as a result: + See :ref:`intersection_type ` description in + :py:class:`_TokenDistance` for details. + **kwargs + Arbitrary keyword arguments + + Other Parameters + ---------------- + qval : int + The length of each q-gram. Using this parameter and tokenizer=None + will cause the instance to use the QGram tokenizer with this + q value. + metric : _Distance + A string distance measure class for use in the ``soft`` and + ``fuzzy`` variants. + threshold : float + A threshold value, similarities above which are counted as + members of the intersection for the ``fuzzy`` variant. + + + .. versionadded:: 0.4.0 + + """ + super(YuleY, self).__init__( + alphabet=alphabet, + tokenizer=tokenizer, + intersection_type=intersection_type, + **kwargs + ) + + def corr(self, src, tar): + """Return Yule's Y correlation of two strings. + + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Yule's Y correlation + + Examples + -------- + >>> cmp = YuleY() + >>> cmp.corr('cat', 'hat') + 0.9034892632818762 + >>> cmp.corr('Niall', 'Neil') + 0.8382551144735259 + >>> cmp.corr('aluminum', 'Catalan') + 0.5749826820237787 + >>> cmp.corr('ATCG', 'TAGC') + -1.0 + + + .. versionadded:: 0.4.0 + + """ + self._tokenize(src, tar) + + a = self._intersection_card() + b = self._src_only_card() + c = self._tar_only_card() + d = self._total_complement_card() + + admbc = (a * d) ** 0.5 - (b * c) ** 0.5 + if admbc: + return admbc / ((a * d) ** 0.5 + (b * c) ** 0.5) + return 0.0 + + def sim(self, src, tar): + """Return Yule's Y similarity of two strings. 
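# Observation (illustrative, not part of the patch): Yule's Q and Y are linked by
# Q = 2Y / (1 + Y**2), which follows directly from the two formulas. Package-level
# exports are assumed.
from abydos.distance import YuleQ, YuleY

y = YuleY().corr('Niall', 'Neil')
assert abs(YuleQ().corr('Niall', 'Neil') - 2 * y / (1 + y * y)) < 1e-10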
+ + Parameters + ---------- + src : str + Source string (or QGrams/Counter objects) for comparison + tar : str + Target string (or QGrams/Counter objects) for comparison + + Returns + ------- + float + Yule's Y similarity + + Examples + -------- + >>> cmp = YuleY() + >>> cmp.sim('cat', 'hat') + 0.9517446316409381 + >>> cmp.sim('Niall', 'Neil') + 0.919127557236763 + >>> cmp.sim('aluminum', 'Catalan') + 0.7874913410118893 + >>> cmp.sim('ATCG', 'TAGC') + 0.0 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 + self.corr(src, tar)) / 2.0 + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/fingerprint/__init__.py b/abydos/fingerprint/__init__.py index 9fead2b2d..b7ddcc4c0 100644 --- a/abydos/fingerprint/__init__.py +++ b/abydos/fingerprint/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -70,6 +70,7 @@ MOST_COMMON_LETTERS_DE, MOST_COMMON_LETTERS_DE_LC, MOST_COMMON_LETTERS_EN_LC, + _Fingerprint, ) from ._occurrence import Occurrence, occurrence_fingerprint from ._occurrence_halved import OccurrenceHalved, occurrence_halved_fingerprint @@ -79,9 +80,10 @@ from ._qgram import QGram, qgram_fingerprint from ._skeleton_key import SkeletonKey, skeleton_key from ._string import String, str_fingerprint -from ._synoname import SynonameToolcode, synoname_toolcode +from ._synoname_toolcode import SynonameToolcode, synoname_toolcode __all__ = [ + '_Fingerprint', 'String', 'str_fingerprint', 'QGram', diff --git a/abydos/fingerprint/_count.py b/abydos/fingerprint/_count.py index 9df4ca041..917406109 100644 --- a/abydos/fingerprint/_count.py +++ b/abydos/fingerprint/_count.py @@ -30,7 +30,10 @@ from collections import Counter +from deprecation import deprecated + from ._fingerprint import MOST_COMMON_LETTERS_CG, _Fingerprint +from .. import __version__ __all__ = ['Count', 'count_fingerprint'] @@ -39,20 +42,36 @@ class Count(_Fingerprint): """Count Fingerprint. Based on the count fingerprint from :cite:`Cislak:2017`. + + .. versionadded:: 0.3.6 """ - def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): - """Return the count fingerprint. + def __init__(self, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): + """Initialize Count instance. Parameters ---------- - word : str - The word to fingerprint n_bits : int Number of bits in the fingerprint returned most_common : list The most common tokens in the target language, ordered by frequency + + .. versionadded:: 0.4.0 + + """ + super(_Fingerprint, self).__init__() + self._n_bits = n_bits + self._most_common = most_common + + def fingerprint(self, word): + """Return the count fingerprint. + + Parameters + ---------- + word : str + The word to fingerprint + Returns ------- int @@ -72,14 +91,20 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): >>> bin(cf.fingerprint('entreatment')) '0b1111010000100000' + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ + n_bits = self._n_bits if n_bits % 2: n_bits += 1 word = Counter(word) fingerprint = 0 - for letter in most_common: + for letter in self._most_common: if n_bits: fingerprint <<= 2 fingerprint += word[letter] & 3 @@ -93,6 +118,12 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): return fingerprint +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Count.fingerprint method instead.', +) def count_fingerprint(word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): """Return the count fingerprint. @@ -125,8 +156,10 @@ def count_fingerprint(word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): >>> bin(count_fingerprint('entreatment')) '0b1111010000100000' + .. versionadded:: 0.3.0 + """ - return Count().fingerprint(word, n_bits, most_common) + return Count(n_bits, most_common).fingerprint(word) if __name__ == '__main__': diff --git a/abydos/fingerprint/_fingerprint.py b/abydos/fingerprint/_fingerprint.py index fee7d79f7..5c371413a 100644 --- a/abydos/fingerprint/_fingerprint.py +++ b/abydos/fingerprint/_fingerprint.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -29,6 +29,8 @@ unicode_literals, ) +__all__ = ['_Fingerprint'] + # fmt: off # most common letters, as defined in Cisłak & Grabowski MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', @@ -71,7 +73,10 @@ class _Fingerprint(object): - """Abstract _Fingerprint class.""" + """Abstract _Fingerprint class. + + .. versionadded:: 0.3.6 + """ def fingerprint(self, word): """Fingerprint string. @@ -81,6 +86,9 @@ def fingerprint(self, word): word : str Word to fingerprint + + .. versionadded:: 0.3.6 + """ pass diff --git a/abydos/fingerprint/_occurrence.py b/abydos/fingerprint/_occurrence.py index c68bf8b66..ef4308d5f 100644 --- a/abydos/fingerprint/_occurrence.py +++ b/abydos/fingerprint/_occurrence.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._fingerprint import MOST_COMMON_LETTERS_CG, _Fingerprint +from .. import __version__ __all__ = ['Occurrence', 'occurrence_fingerprint'] @@ -37,20 +40,36 @@ class Occurrence(_Fingerprint): """Occurrence Fingerprint. Based on the occurrence fingerprint from :cite:`Cislak:2017`. + + .. versionadded:: 0.3.6 """ - def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): - """Return the occurrence fingerprint. + def __init__(self, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): + """Initialize Count instance. Parameters ---------- - word : str - The word to fingerprint n_bits : int Number of bits in the fingerprint returned most_common : list The most common tokens in the target language, ordered by frequency + + .. versionadded:: 0.4.0 + + """ + super(_Fingerprint, self).__init__() + self._n_bits = n_bits + self._most_common = most_common + + def fingerprint(self, word): + """Return the occurrence fingerprint. + + Parameters + ---------- + word : str + The word to fingerprint + Returns ------- int @@ -70,11 +89,17 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): >>> bin(of.fingerprint('entreatment')) '0b1110010010000100' + + .. versionadded:: 0.3.0 + .. 
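# Companion sketch for the occurrence fingerprint above (again simplified,
# not the abydos code): one presence bit per common letter instead of a
# two-bit count, so the result is a bitmap of which common letters occur in
# the word at all. Letter list and bit width are shortened, assumed values.
def occurrence_fingerprint_sketch(
    word, most_common=('e', 't', 'a', 'o'), n_bits=4
):
    letters = set(word)
    fingerprint = 0
    for letter in most_common[:n_bits]:
        fingerprint = (fingerprint << 1) | int(letter in letters)
    fingerprint <<= max(0, n_bits - len(most_common))  # zero-pad if needed
    return fingerprint


# 'e', 't', and 'a' occur in the word, 'o' does not: 1 1 1 0
assert occurrence_fingerprint_sketch('entreatment') == 0b1110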
versionchanged:: 0.3.6 + Encapsulated in class + """ + n_bits = self._n_bits word = set(word) fingerprint = 0 - for letter in most_common: + for letter in self._most_common: if letter in word: fingerprint += 1 n_bits -= 1 @@ -90,6 +115,12 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): return fingerprint +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Occurrence.fingerprint method instead.', +) def occurrence_fingerprint( word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG ): @@ -124,8 +155,10 @@ def occurrence_fingerprint( >>> bin(occurrence_fingerprint('entreatment')) '0b1110010010000100' + .. versionadded:: 0.3.0 + """ - return Occurrence().fingerprint(word, n_bits, most_common) + return Occurrence(n_bits, most_common).fingerprint(word) if __name__ == '__main__': diff --git a/abydos/fingerprint/_occurrence_halved.py b/abydos/fingerprint/_occurrence_halved.py index be64c1c9e..9e65c5a42 100644 --- a/abydos/fingerprint/_occurrence_halved.py +++ b/abydos/fingerprint/_occurrence_halved.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._fingerprint import MOST_COMMON_LETTERS_CG, _Fingerprint +from .. import __version__ __all__ = ['OccurrenceHalved', 'occurrence_halved_fingerprint'] @@ -37,9 +40,29 @@ class OccurrenceHalved(_Fingerprint): """Occurrence Halved Fingerprint. Based on the occurrence halved fingerprint from :cite:`Cislak:2017`. + + .. versionadded:: 0.3.6 """ - def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): + def __init__(self, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): + """Initialize Count instance. + + Parameters + ---------- + n_bits : int + Number of bits in the fingerprint returned + most_common : list + The most common tokens in the target language, ordered by frequency + + + .. versionadded:: 0.4.0 + + """ + super(_Fingerprint, self).__init__() + self._n_bits = n_bits + self._most_common = most_common + + def fingerprint(self, word): """Return the occurrence halved fingerprint. Based on the occurrence halved fingerprint from :cite:`Cislak:2017`. @@ -72,7 +95,13 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): >>> bin(ohf.fingerprint('entreatment')) '0b1111010000110000' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ + n_bits = self._n_bits if n_bits % 2: n_bits += 1 @@ -81,7 +110,7 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): w_2 = set(word[w_len:]) fingerprint = 0 - for letter in most_common: + for letter in self._most_common: if n_bits: fingerprint <<= 1 if letter in w_1: @@ -99,6 +128,12 @@ def fingerprint(self, word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): return fingerprint +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the OccurrenceHalved.fingerprint method instead.', +) def occurrence_halved_fingerprint( word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG ): @@ -133,8 +168,10 @@ def occurrence_halved_fingerprint( >>> bin(occurrence_halved_fingerprint('entreatment')) '0b1111010000110000' + .. 
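# Sketch of the deprecation pattern applied throughout this patch
# (hypothetical names; '0.4.0' stands in for abydos.__version__): the old
# function-style API survives as a thin wrapper that forwards to the new
# class, and the `deprecation` package emits a warning pointing callers at
# the replacement method.
from deprecation import deprecated


class Encoder(object):
    """Hypothetical class-based replacement for an older helper function."""

    def __init__(self, n_bits=16):
        self._n_bits = n_bits

    def fingerprint(self, word):
        return len(word) % (2 ** self._n_bits)  # placeholder computation


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version='0.4.0',
    details='Use the Encoder.fingerprint method instead.',
)
def encoder_fingerprint(word, n_bits=16):
    """Forward to Encoder.fingerprint; callers get a deprecation warning."""
    return Encoder(n_bits).fingerprint(word)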
versionadded:: 0.3.0 + """ - return OccurrenceHalved().fingerprint(word, n_bits, most_common) + return OccurrenceHalved(n_bits, most_common).fingerprint(word) if __name__ == '__main__': diff --git a/abydos/fingerprint/_omission_key.py b/abydos/fingerprint/_omission_key.py index 633d1086c..2e0ca805e 100644 --- a/abydos/fingerprint/_omission_key.py +++ b/abydos/fingerprint/_omission_key.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._fingerprint import _Fingerprint +from .. import __version__ __all__ = ['OmissionKey', 'omission_key'] @@ -41,6 +44,8 @@ class OmissionKey(_Fingerprint): """Omission Key. The omission key of a word is defined in :cite:`Pollock:1984`. + + .. versionadded:: 0.3.6 """ _consonants = tuple('JKQXZVWYBFMGPDHCLNTSR') @@ -69,6 +74,11 @@ def fingerprint(self, word): >>> ok.fingerprint('Niall') 'LNIA' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = ''.join(c for c in word if c in self._letters) @@ -88,6 +98,12 @@ def fingerprint(self, word): return key +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the OmissionKey.fingerprint method instead.', +) def omission_key(word): """Return the omission key. @@ -112,6 +128,8 @@ def omission_key(word): >>> omission_key('Niall') 'LNIA' + .. versionadded:: 0.1.0 + """ return OmissionKey().fingerprint(word) diff --git a/abydos/fingerprint/_phonetic.py b/abydos/fingerprint/_phonetic.py index db447dbd2..0236f48f3 100644 --- a/abydos/fingerprint/_phonetic.py +++ b/abydos/fingerprint/_phonetic.py @@ -28,10 +28,15 @@ unicode_literals, ) +from deprecation import deprecated + from six import text_type from ._string import String -from ..phonetic import double_metaphone +from .. import __version__ +from ..phonetic import DoubleMetaphone, double_metaphone +from ..phonetic._phonetic import _Phonetic + __all__ = ['Phonetic', 'phonetic_fingerprint'] @@ -44,32 +49,37 @@ class Phonetic(String): function after converting the string to its phonetic form, as determined by some phonetic algorithm. This fingerprint is described at :cite:`OpenRefine:2012`. + + .. versionadded:: 0.3.6 """ - def fingerprint( - self, - phrase, - phonetic_algorithm=double_metaphone, - joiner=' ', - *args, - **kwargs - ): - """Return the phonetic fingerprint of a phrase. + def __init__(self, phonetic_algorithm=None, joiner=' '): + """Initialize Phonetic instance. - Parameters - ---------- - phrase : str - The string from which to calculate the phonetic fingerprint phonetic_algorithm : function A phonetic algorithm that takes a string and returns a string (presumably a phonetic representation of the original string). By default, this function uses :py:func:`.double_metaphone`. joiner : str The string that will be placed between each word - *args - Variable length argument list - **kwargs - Arbitrary keyword arguments + + + .. versionadded:: 0.4.0 + + """ + self._phonetic_algorithm = phonetic_algorithm + if phonetic_algorithm is None: + self._phonetic_algorithm = DoubleMetaphone() + + self._joiner = joiner + + def fingerprint(self, phrase): + """Return the phonetic fingerprint of a phrase. 
+ + Parameters + ---------- + phrase : str + The string from which to calculate the phonetic fingerprint Returns ------- @@ -81,22 +91,37 @@ def fingerprint( >>> pf = Phonetic() >>> pf.fingerprint('The quick brown fox jumped over the lazy dog.') '0 afr fks jmpt kk ls prn tk' - >>> from abydos.phonetic import soundex - >>> pf.fingerprint('The quick brown fox jumped over the lazy dog.', - ... phonetic_algorithm=soundex) + + >>> from abydos.phonetic import Soundex + >>> pf = Phonetic(Soundex()) + >>> pf.fingerprint('The quick brown fox jumped over the lazy dog.') 'b650 d200 f200 j513 l200 o160 q200 t000' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ phonetic = '' for word in phrase.split(): - word = phonetic_algorithm(word, *args, **kwargs) + if isinstance(self._phonetic_algorithm, _Phonetic): + word = self._phonetic_algorithm.encode(word) + else: + word = self._phonetic_algorithm(word) if not isinstance(word, text_type) and hasattr(word, '__iter__'): word = word[0] - phonetic += word + joiner - phonetic = phonetic[: -len(joiner)] - return super(self.__class__, self).fingerprint(phonetic) + phonetic += word + self._joiner + phonetic = phonetic[: -len(self._joiner)] + return super(Phonetic, self).fingerprint(phonetic) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Phonetic.fingerprint method instead.', +) def phonetic_fingerprint( phrase, phonetic_algorithm=double_metaphone, joiner=' ', *args, **kwargs ): @@ -128,15 +153,18 @@ def phonetic_fingerprint( -------- >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.') '0 afr fks jmpt kk ls prn tk' + >>> from abydos.phonetic import soundex >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.', ... phonetic_algorithm=soundex) 'b650 d200 f200 j513 l200 o160 q200 t000' + .. versionadded:: 0.1.0 + """ - return Phonetic().fingerprint( - phrase, phonetic_algorithm, joiner, *args, **kwargs - ) + return Phonetic( + lambda phrase: phonetic_algorithm(phrase, *args, **kwargs), joiner + ).fingerprint(phrase) if __name__ == '__main__': diff --git a/abydos/fingerprint/_position.py b/abydos/fingerprint/_position.py index 4356c717e..45bd97b1e 100644 --- a/abydos/fingerprint/_position.py +++ b/abydos/fingerprint/_position.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._fingerprint import MOST_COMMON_LETTERS_CG, _Fingerprint +from .. import __version__ __all__ = ['Position', 'position_fingerprint'] @@ -37,27 +40,38 @@ class Position(_Fingerprint): """Position Fingerprint. Based on the position fingerprint from :cite:`Cislak:2017`. + + .. versionadded:: 0.3.6 """ - def fingerprint( - self, - word, - n_bits=16, - most_common=MOST_COMMON_LETTERS_CG, - bits_per_letter=3, + def __init__( + self, n_bits=16, most_common=MOST_COMMON_LETTERS_CG, bits_per_letter=3 ): - """Return the position fingerprint. + """Initialize Count instance. Parameters ---------- - word : str - The word to fingerprint n_bits : int Number of bits in the fingerprint returned most_common : list The most common tokens in the target language, ordered by frequency - bits_per_letter : int - The bits to assign for letter position + + + .. versionadded:: 0.4.0 + + """ + super(_Fingerprint, self).__init__() + self._n_bits = n_bits + self._most_common = most_common + self._bits_per_letter = bits_per_letter + + def fingerprint(self, word): + """Return the position fingerprint. 
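# Usage sketch for the reworked Phonetic fingerprint above (assumes the
# patched abydos 0.4.0 API): the phonetic algorithm is now chosen once at
# construction time, either as a _Phonetic instance (its .encode is used) or
# as a plain callable returning a string. The expected values repeat the
# doctests shown above.
from abydos.fingerprint import Phonetic
from abydos.phonetic import Soundex

sentence = 'The quick brown fox jumped over the lazy dog.'
assert Phonetic().fingerprint(sentence) == '0 afr fks jmpt kk ls prn tk'
assert Phonetic(Soundex()).fingerprint(sentence) == (
    'b650 d200 f200 j513 l200 o160 q200 t000'
)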
+ + Parameters + ---------- + word : str + The word to fingerprint Returns ------- @@ -77,24 +91,30 @@ def fingerprint( >>> bin(position_fingerprint('entreatment')) '0b101011111111' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ + n_bits = self._n_bits position = {} for pos, letter in enumerate(word): - if letter not in position and letter in most_common: - position[letter] = min(pos, 2 ** bits_per_letter - 1) + if letter not in position and letter in self._most_common: + position[letter] = min(pos, 2 ** self._bits_per_letter - 1) fingerprint = 0 - for letter in most_common: + for letter in self._most_common: if n_bits: - fingerprint <<= min(bits_per_letter, n_bits) + fingerprint <<= min(self._bits_per_letter, n_bits) if letter in position: fingerprint += min(position[letter], 2 ** n_bits - 1) else: fingerprint += min( - 2 ** bits_per_letter - 1, 2 ** n_bits - 1 + 2 ** self._bits_per_letter - 1, 2 ** n_bits - 1 ) - n_bits -= min(bits_per_letter, n_bits) + n_bits -= min(self._bits_per_letter, n_bits) else: break @@ -105,6 +125,12 @@ def fingerprint( return fingerprint +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Position.fingerprint method instead.', +) def position_fingerprint( word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG, bits_per_letter=3 ): @@ -141,8 +167,10 @@ def position_fingerprint( >>> bin(position_fingerprint('entreatment')) '0b101011111111' + .. versionadded:: 0.3.0 + """ - return Position().fingerprint(word, n_bits, most_common, bits_per_letter) + return Position(n_bits, most_common, bits_per_letter).fingerprint(word) if __name__ == '__main__': diff --git a/abydos/fingerprint/_qgram.py b/abydos/fingerprint/_qgram.py index 2d03bd543..dcc84bc9c 100644 --- a/abydos/fingerprint/_qgram.py +++ b/abydos/fingerprint/_qgram.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._fingerprint import _Fingerprint +from .. import __version__ from ..tokenizer import QGrams __all__ = ['QGram', 'qgram_fingerprint'] @@ -44,15 +47,13 @@ class QGram(_Fingerprint): A q-gram fingerprint is a string consisting of all of the unique q-grams in a string, alphabetized & concatenated. This fingerprint is described at :cite:`OpenRefine:2012`. + + .. versionadded:: 0.3.6 """ - def fingerprint(self, phrase, qval=2, start_stop='', joiner=''): - """Return Q-Gram fingerprint. + def __init__(self, qval=2, start_stop='', joiner='', skip=0): + """Initialize Q-Gram fingerprinter. - Parameters - ---------- - phrase : str - The string from which to calculate the q-gram fingerprint qval : int The length of each q-gram (by default 2) start_stop : str @@ -60,6 +61,24 @@ def fingerprint(self, phrase, qval=2, start_stop='', joiner=''): phrase, as defined in :py:class:`tokenizer.QGrams` joiner : str The string that will be placed between each word + skip : int or Iterable + The number of characters to skip, can be an integer, range object, + or list + + + .. versionadded:: 0.4.0 + + """ + self._tokenizer = QGrams(qval, start_stop, skip) + self._joiner = joiner + + def fingerprint(self, phrase): + """Return Q-Gram fingerprint. + + Parameters + ---------- + phrase : str + The string from which to calculate the q-gram fingerprint Returns ------- @@ -76,14 +95,25 @@ def fingerprint(self, phrase, qval=2, start_stop='', joiner=''): >>> qf.fingerprint('Niall') 'aliallni' + + .. versionadded:: 0.1.0 + .. 
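# Simplified standalone sketch of the q-gram fingerprint illustrated by the
# doctest above (plain Python, not the abydos tokenizer): the unique bigrams
# of the lowercased word, alphabetized and concatenated.
def qgram_fingerprint_sketch(word, qval=2):
    word = ''.join(c for c in word.lower() if c.isalnum())
    qgrams = {word[i:i + qval] for i in range(len(word) - qval + 1)}
    return ''.join(sorted(qgrams))


assert qgram_fingerprint_sketch('Niall') == 'aliallni'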
versionchanged:: 0.3.6 + Encapsulated in class + """ phrase = unicode_normalize('NFKD', text_type(phrase.strip().lower())) phrase = ''.join(c for c in phrase if c.isalnum()) - phrase = QGrams(phrase, qval, start_stop) - phrase = joiner.join(sorted(phrase)) + phrase = self._tokenizer.tokenize(phrase).get_set() + phrase = self._joiner.join(sorted(phrase)) return phrase +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the QGram.fingerprint method instead.', +) def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''): """Return Q-Gram fingerprint. @@ -115,8 +145,12 @@ def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''): >>> qgram_fingerprint('Niall') 'aliallni' + .. versionadded:: 0.1.0 + """ - return QGram().fingerprint(phrase, qval, start_stop, joiner) + return QGram(qval=qval, start_stop=start_stop, joiner=joiner).fingerprint( + phrase + ) if __name__ == '__main__': diff --git a/abydos/fingerprint/_skeleton_key.py b/abydos/fingerprint/_skeleton_key.py index ac7b3a25c..b7f7b9994 100644 --- a/abydos/fingerprint/_skeleton_key.py +++ b/abydos/fingerprint/_skeleton_key.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._fingerprint import _Fingerprint +from .. import __version__ __all__ = ['SkeletonKey', 'skeleton_key'] @@ -41,6 +44,8 @@ class SkeletonKey(_Fingerprint): """Skeleton Key. The skeleton key of a word is defined in :cite:`Pollock:1984`. + + .. versionadded:: 0.3.6 """ _vowels = set('AEIOU') @@ -69,6 +74,11 @@ def fingerprint(self, word): >>> sk.fingerprint('Niall') 'NLIA' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = ''.join(c for c in word if c in self._letters) @@ -89,6 +99,12 @@ def fingerprint(self, word): return start + consonant_part + vowel_part +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SkeletonKey.fingerprint method instead.', +) def skeleton_key(word): """Return the skeleton key. @@ -113,6 +129,8 @@ def skeleton_key(word): >>> skeleton_key('Niall') 'NLIA' + .. versionadded:: 0.1.0 + """ return SkeletonKey().fingerprint(word) diff --git a/abydos/fingerprint/_string.py b/abydos/fingerprint/_string.py index 97554aeea..140119ede 100644 --- a/abydos/fingerprint/_string.py +++ b/abydos/fingerprint/_string.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._fingerprint import _Fingerprint +from .. import __version__ __all__ = ['String', 'str_fingerprint'] @@ -43,17 +46,31 @@ class String(_Fingerprint): The fingerprint of a string is a string consisting of all of the unique words in a string, alphabetized & concatenated with intervening joiners. This fingerprint is described at :cite:`OpenRefine:2012`. + + .. versionadded:: 0.3.6 """ - def fingerprint(self, phrase, joiner=' '): + def __init__(self, joiner=' '): + """Initialize String instance. + + Parameters + ---------- + joiner : str + The string that will be placed between each word + + + .. versionadded:: 0.4.0 + + """ + self._joiner = joiner + + def fingerprint(self, phrase): """Return string fingerprint. 
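# Rough standalone sketch of the Pollock & Zamora skeleton key built above
# (simplified relative to the patched class, which also normalizes Unicode
# and filters to A-Z): first letter, then the remaining unique consonants in
# order of appearance, then the unique vowels in order of appearance.
def skeleton_key_sketch(word):
    word = word.upper()
    start, rest = word[:1], word[1:]
    seen = set(start)
    consonants, vowels = [], []
    for ch in rest:
        if ch in seen:
            continue
        seen.add(ch)
        (vowels if ch in 'AEIOU' else consonants).append(ch)
    return start + ''.join(consonants) + ''.join(vowels)


assert skeleton_key_sketch('Niall') == 'NLIA'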
Parameters ---------- phrase : str The string from which to calculate the fingerprint - joiner : str - The string that will be placed between each word Returns ------- @@ -66,13 +83,24 @@ def fingerprint(self, phrase, joiner=' '): >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.') 'brown dog fox jumped lazy over quick the' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ phrase = unicode_normalize('NFKD', text_type(phrase.strip().lower())) phrase = ''.join([c for c in phrase if c.isalnum() or c.isspace()]) - phrase = joiner.join(sorted(list(set(phrase.split())))) + phrase = self._joiner.join(sorted(list(set(phrase.split())))) return phrase +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the String.fingerprint method instead.', +) def str_fingerprint(phrase, joiner=' '): """Return string fingerprint. @@ -95,8 +123,10 @@ def str_fingerprint(phrase, joiner=' '): >>> str_fingerprint('The quick brown fox jumped over the lazy dog.') 'brown dog fox jumped lazy over quick the' + .. versionadded:: 0.1.0 + """ - return String().fingerprint(phrase, joiner) + return String(joiner).fingerprint(phrase) if __name__ == '__main__': diff --git a/abydos/fingerprint/_synoname.py b/abydos/fingerprint/_synoname_toolcode.py similarity index 97% rename from abydos/fingerprint/_synoname.py rename to abydos/fingerprint/_synoname_toolcode.py index f3e891067..b6ea20d9b 100644 --- a/abydos/fingerprint/_synoname.py +++ b/abydos/fingerprint/_synoname_toolcode.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._fingerprint import _Fingerprint +from .. import __version__ __all__ = ['SynonameToolcode', 'synoname_toolcode'] @@ -37,6 +40,8 @@ class SynonameToolcode(_Fingerprint): """Synoname Toolcode. Cf. :cite:`Getty:1991,Gross:1991`. + + .. versionadded:: 0.3.6 """ _synoname_special_table = ( @@ -305,6 +310,11 @@ def fingerprint(self, lname, fname='', qual='', normalize=0): >>> st.fingerprint('Michelangelo IV', '', 'Workshop of') ('michelangelo iv', '', '3000550015$055b$mi') + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ lname = lname.lower() fname = fname.lower() @@ -400,6 +410,8 @@ def roman_check(numeral, fname, lname): tuple First and last names with Roman numeral moved + .. versionadded:: 0.3.0 + """ loc = fname.find(numeral) if fname and ( @@ -492,6 +504,12 @@ def roman_check(numeral, fname, lname): return lname, fname, ''.join(toolcode) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SynonameToolcode.fingerprint method instead.', +) def synoname_toolcode(lname, fname='', qual='', normalize=0): """Build the Synoname toolcode. @@ -531,6 +549,8 @@ def synoname_toolcode(lname, fname='', qual='', normalize=0): >>> synoname_toolcode('Michelangelo IV', '', 'Workshop of') ('michelangelo iv', '', '3000550015$055b$mi') + .. versionadded:: 0.3.0 + """ return SynonameToolcode().fingerprint(lname, fname, qual, normalize) diff --git a/abydos/phones/_phones.py b/abydos/phones/_phones.py index 34032553c..5c09dcec9 100644 --- a/abydos/phones/_phones.py +++ b/abydos/phones/_phones.py @@ -609,6 +609,8 @@ def ipa_to_features(ipa): [2783230754502126250, 1826957430176000426, 2693158761954453926, 2783230754501863834] + .. versionadded:: 0.1.0 + """ features = [] pos = 0 @@ -697,6 +699,8 @@ def get_feature(vector, feature): >>> get_feature(tails, 'coronal') [1, -1, 1, 1] + .. 
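# One-line standalone equivalent of the string fingerprint shown above (the
# OpenRefine-style key): unique lowercase words, alphabetized and re-joined.
# The expected value repeats the doctest above.
def string_fingerprint_sketch(phrase, joiner=' '):
    phrase = ''.join(c for c in phrase.lower() if c.isalnum() or c.isspace())
    return joiner.join(sorted(set(phrase.split())))


assert string_fingerprint_sketch(
    'The quick brown fox jumped over the lazy dog.'
) == 'brown dog fox jumped lazy over quick the'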
versionadded:: 0.1.0 + """ # :param bool binary: if False, -1, 0, & 1 represent -, 0, & + # if True, only binary oppositions are allowed: @@ -794,6 +798,8 @@ def cmp_features(feat1, feat2): >>> cmp_features(ipa_to_features('l')[0], ipa_to_features('i')[0]) 0.564516129032258 + .. versionadded:: 0.1.0 + """ if feat1 < 0 or feat2 < 0: return -1.0 diff --git a/abydos/phonetic/__init__.py b/abydos/phonetic/__init__.py index 9c796208d..a208afaed 100644 --- a/abydos/phonetic/__init__.py +++ b/abydos/phonetic/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -37,7 +37,7 @@ - Phonix (:py:class:`.Phonix`) - Standardized Phonetic Frequency Code (:py:class:`.SPFC`) - Statistics Canada (:py:class:`.StatisticsCanada`) - - Lein (:py:class:`.Lein`) + - LEIN (:py:class:`.LEIN`) - Roger Root (:py:class:`.RogerRoot`) - Eudex phonetic hash (:py:class:`.phonetic.Eudex`) - Parmar-Kumbharana (:py:class:`.ParmarKumbharana`) @@ -69,6 +69,7 @@ For Swedish: - SfinxBis (:py:class:`.SfinxBis`) + - Wåhlin (:py:class:`.Waahlin`) For Norwegian: @@ -125,7 +126,7 @@ koelner_phonetik_alpha, koelner_phonetik_num_to_alpha, ) -from ._lein import Lein, lein +from ._lein import LEIN, lein from ._meta_soundex import MetaSoundex, metasoundex from ._metaphone import Metaphone, metaphone from ._mra import MRA, mra @@ -136,6 +137,7 @@ from ._parmar_kumbharana import ParmarKumbharana, parmar_kumbharana from ._phonem import Phonem, phonem from ._phonet import Phonet, phonet +from ._phonetic import _Phonetic from ._phonetic_spanish import PhoneticSpanish, phonetic_spanish from ._phonex import Phonex, phonex from ._phonix import Phonix, phonix @@ -157,8 +159,10 @@ from ._spanish_metaphone import SpanishMetaphone, spanish_metaphone from ._spfc import SPFC, spfc from ._statistics_canada import StatisticsCanada, statistics_canada +from ._waahlin import Waahlin __all__ = [ + '_Phonetic', 'RussellIndex', 'russell_index', 'russell_index_num_to_alpha', @@ -171,7 +175,7 @@ 'dm_soundex', 'FuzzySoundex', 'fuzzy_soundex', - 'Lein', + 'LEIN', 'lein', 'Phonex', 'phonex', @@ -241,6 +245,7 @@ 'spanish_metaphone', 'SfinxBis', 'sfinxbis', + 'Waahlin', 'Norphone', 'norphone', ] diff --git a/abydos/phonetic/_alpha_sis.py b/abydos/phonetic/_alpha_sis.py index 9f95f5b76..ac963a75a 100644 --- a/abydos/phonetic/_alpha_sis.py +++ b/abydos/phonetic/_alpha_sis.py @@ -30,10 +30,13 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['AlphaSIS', 'alpha_sis'] @@ -43,6 +46,8 @@ class AlphaSIS(_Phonetic): The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`. This implementation is based on the description in :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ _alpha_sis_initials = { @@ -154,7 +159,68 @@ class AlphaSIS(_Phonetic): 'P', ) - def encode(self, word, max_length=14): + _alphabetic_initials = dict(zip((ord(_) for _ in '012345'), ' AHJWY')) + _alphabetic_non_initials = dict( + zip((ord(_) for _ in '0123456789'), 'STNMRLJKFP') + ) + + def __init__(self, max_length=14): + """Initialize AlphaSIS instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to 14) + + + .. 
versionadded:: 0.4.0 + + """ + # Clamp max_length to [4, 64] + if max_length != -1: + self._max_length = min(max(4, max_length), 64) + else: + self._max_length = 64 + + def encode_alpha(self, word): + """Return the alphabetic Alpha-SIS code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + tuple + The alphabetic Alpha-SIS value + + Examples + -------- + >>> pe = AlphaSIS() + >>> pe.encode_alpha('Christopher') + ('JRSTFR', 'KSRSTFR', 'RSTFR') + >>> pe.encode_alpha('Niall') + ('NL',) + >>> pe.encode_alpha('Smith') + ('MT',) + >>> pe.encode_alpha('Schmidt') + ('JMT',) + + + .. versionadded:: 0.4.0 + + """ + codes = self.encode(word) + alphas = [ + code[0].translate(self._alphabetic_initials).strip() + + code[1:].translate(self._alphabetic_non_initials).rstrip('S') + for code in codes + ] + + return tuple(alphas) + + def encode(self, word): """Return the IBM Alpha Search Inquiry System code for a word. A collection is necessary as the return type since there can be @@ -165,8 +231,6 @@ def encode(self, word, max_length=14): ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to 14) Returns ------- @@ -185,6 +249,11 @@ def encode(self, word, max_length=14): >>> pe.encode('Schmidt') ('06310000000000',) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ alpha = [''] pos = 0 @@ -192,12 +261,6 @@ def encode(self, word, max_length=14): word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) - # Clamp max_length to [4, 64] - if max_length != -1: - max_length = min(max(4, max_length), 64) - else: - max_length = 64 - # Do special processing for initial substrings for k in self._alpha_sis_initials_order: if word.startswith(k): @@ -240,10 +303,18 @@ def encode(self, word, max_length=14): alpha = (_.replace('_', '') for _ in alpha) # Trim codes and return tuple - alpha = ((_ + ('0' * max_length))[:max_length] for _ in alpha) + alpha = ( + (_ + ('0' * self._max_length))[: self._max_length] for _ in alpha + ) return tuple(alpha) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the AlphaSIS.encode method instead.', +) def alpha_sis(word, max_length=14): """Return the IBM Alpha Search Inquiry System code for a word. @@ -272,8 +343,10 @@ def alpha_sis(word, max_length=14): >>> alpha_sis('Schmidt') ('06310000000000',) + .. versionadded:: 0.1.0 + """ - return AlphaSIS().encode(word, max_length) + return AlphaSIS(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_beider_morse.py b/abydos/phonetic/_beider_morse.py index a24ea87ac..b6f71a188 100644 --- a/abydos/phonetic/_beider_morse.py +++ b/abydos/phonetic/_beider_morse.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # This file is based on Alexander Beider and Stephen P. Morse's implementation @@ -35,6 +35,8 @@ from re import search from unicodedata import normalize +from deprecation import deprecated + from six import PY3, text_type from six.moves import range @@ -63,6 +65,7 @@ L_TURKISH, ) from ._phonetic import _Phonetic +from .. 
import __version__ __all__ = ['BeiderMorse', 'bmpm'] @@ -107,7 +110,7 @@ 'du ', 'van ', 'von ', - 'd\'', + "d'", } BMDATA['sep']['discards'] = { 'al', @@ -142,6 +145,8 @@ class BeiderMorse(_Phonetic): The Beider-Morse Phonetic Matching algorithm is described in :cite:`Beider:2008`. The reference implementation is licensed under GPLv3. + + .. versionadded:: 0.3.6 """ def _language(self, name, name_mode): @@ -160,6 +165,11 @@ def _language(self, name, name_mode): int Language ID + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ name = name.strip().lower() rules = BMDATA[name_mode]['language_rules'] @@ -206,6 +216,11 @@ def _redo_language( str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ language_arg = self._language(term, name_mode) return self._phonetic( @@ -253,6 +268,11 @@ def _phonetic( str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ term = term.replace('-', ' ').strip() @@ -299,7 +319,7 @@ def _phonetic( # this is a bug, but I won't try to fix it now for word in words: - word = word[word.rfind('\'') + 1 :] + word = word[word.rfind("'") + 1 :] if word not in BMDATA['sep']['discards']: words2.append(word) @@ -416,6 +436,11 @@ def _apply_final_rules(self, phonetic, final_rules, language_arg, strip): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # optimization to save time if not final_rules: @@ -513,6 +538,11 @@ def _phonetic_number(self, phonetic): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if '[' in phonetic: return phonetic[: phonetic.find('[')] @@ -520,7 +550,7 @@ def _phonetic_number(self, phonetic): return phonetic # experimental !!!! def _expand_alternates(self, phonetic): - """Expand phonetic alternates separated by |s. + r"""Expand phonetic alternates separated by \|s. Parameters ---------- @@ -532,6 +562,11 @@ def _expand_alternates(self, phonetic): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ alt_start = phonetic.find('(') if alt_start == -1: @@ -569,6 +604,11 @@ def _pnums_with_leading_space(self, phonetic): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ alt_start = phonetic.find('(') if alt_start == -1: @@ -603,6 +643,11 @@ def _phonetic_numbers(self, phonetic): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ phonetic_array = phonetic.split('-') # for names with spaces in them result = ' '.join( @@ -623,6 +668,11 @@ def _remove_dupes(self, phonetic): str A Beider-Morse phonetic code + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ alt_string = phonetic alt_array = alt_string.split('|') @@ -664,6 +714,11 @@ def _normalize_lang_attrs(self, text, strip): ValueError No closing square bracket + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ uninitialized = -1 # all 1's attrib = uninitialized @@ -721,6 +776,11 @@ def _apply_rule_if_compat(self, phonetic, target, language_arg): str A candidate encoding + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ candidate = phonetic + target if '[' not in candidate: # no attributes so we need test no further @@ -774,6 +834,11 @@ def _language_index_from_code(self, code, name_mode): int Language code index + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if code < 1 or code > sum( _LANG_DICT[_] for _ in BMDATA[name_mode]['languages'] @@ -785,22 +850,19 @@ def _language_index_from_code(self, code, name_mode): return L_ANY return code - def encode( + def __init__( self, - word, language_arg=0, name_mode='gen', match_mode='approx', concat=False, filter_langs=False, ): - """Return the Beider-Morse Phonetic Matching encoding(s) of a term. + """Initialize BeiderMorse instance. Parameters ---------- - word : str - The word to transform - language_arg : int + language_arg : str or int The language of the term; supported values include: - ``any`` @@ -838,6 +900,49 @@ def encode( filter_langs : bool Filter out incompatible languages + + .. versionadded:: 0.4.0 + + """ + name_mode = name_mode.strip().lower()[:3] + if name_mode not in {'ash', 'sep', 'gen'}: + name_mode = 'gen' + + if match_mode != 'exact': + match_mode = 'approx' + + # Translate the supplied language_arg value into an integer + # representing a set of languages + all_langs = ( + sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1 + ) + lang_choices = 0 + if isinstance(language_arg, (int, float, long)): + self._lang_choices = int(language_arg) + elif language_arg != '' and isinstance(language_arg, (text_type, str)): + for lang in text_type(language_arg).lower().split(','): + if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs): + lang_choices += _LANG_DICT[lang] + elif not filter_langs: + raise ValueError( + "Unknown '" + name_mode + "' language: '" + lang + "'" + ) + + self._language_arg = language_arg + self._name_mode = name_mode + self._match_mode = match_mode + self._concat = concat + self._filter_langs = filter_langs + self._lang_choices = lang_choices + + def encode(self, word): + """Return the Beider-Morse Phonetic Matching encoding(s) of a term. + + Parameters + ---------- + word : str + The word to transform + Returns ------- tuple @@ -863,76 +968,63 @@ def encode( >>> pe.encode('Schmidt') 'zmit stzmit' - >>> pe.encode('Christopher', language_arg='German') + >>> BeiderMorse(language_arg='German').encode('Christopher') 'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir xristYfir' - >>> pe.encode('Christopher', language_arg='English') + >>> BeiderMorse(language_arg='English').encode('Christopher') 'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir xrQstafir' - >>> pe.encode('Christopher', language_arg='German', name_mode='ash') + >>> BeiderMorse(language_arg='German', + ... name_mode='ash').encode('Christopher') 'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir xristYfir' - >>> pe.encode('Christopher', language_arg='German', match_mode='exact') + >>> BeiderMorse(language_arg='German', + ... match_mode='exact').encode('Christopher') 'xriStopher xriStofer xristopher xristofer' - """ - word = normalize('NFC', text_type(word.strip().lower())) - - name_mode = name_mode.strip().lower()[:3] - if name_mode not in {'ash', 'sep', 'gen'}: - name_mode = 'gen' - if match_mode != 'exact': - match_mode = 'approx' + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class - # Translate the supplied language_arg value into an integer - # representing a set of languages - all_langs = ( - sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1 - ) - lang_choices = 0 - if isinstance(language_arg, (int, float, long)): - lang_choices = int(language_arg) - elif language_arg != '' and isinstance(language_arg, (text_type, str)): - for lang in text_type(language_arg).lower().split(','): - if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs): - lang_choices += _LANG_DICT[lang] - elif not filter_langs: - raise ValueError( - 'Unknown \'' - + name_mode - + '\' language: \'' - + lang - + '\'' - ) + """ + word = normalize('NFC', text_type(word.strip().lower())) # Language choices are either all incompatible with the name mode or # no choices were given, so try to autodetect - if lang_choices == 0: - language_arg = self._language(word, name_mode) + if self._lang_choices == 0: + language_arg = self._language(word, self._name_mode) else: - language_arg = lang_choices - language_arg2 = self._language_index_from_code(language_arg, name_mode) + language_arg = self._lang_choices + language_arg2 = self._language_index_from_code( + language_arg, self._name_mode + ) - rules = BMDATA[name_mode]['rules'][language_arg2] - final_rules1 = BMDATA[name_mode][match_mode]['common'] - final_rules2 = BMDATA[name_mode][match_mode][language_arg2] + rules = BMDATA[self._name_mode]['rules'][language_arg2] + final_rules1 = BMDATA[self._name_mode][self._match_mode]['common'] + final_rules2 = BMDATA[self._name_mode][self._match_mode][language_arg2] result = self._phonetic( word, - name_mode, + self._name_mode, rules, final_rules1, final_rules2, language_arg, - concat, + self._concat, ) result = self._phonetic_numbers(result) return result +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the BeiderMorse.encode method instead.', +) def bmpm( word, language_arg=0, @@ -1019,10 +1111,12 @@ def bmpm( >>> bmpm('Christopher', language_arg='German', match_mode='exact') 'xriStopher xriStofer xristopher xristofer' + .. versionadded:: 0.1.0 + """ - return BeiderMorse().encode( - word, language_arg, name_mode, match_mode, concat, filter_langs - ) + return BeiderMorse( + language_arg, name_mode, match_mode, concat, filter_langs + ).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_beider_morse_data.py b/abydos/phonetic/_beider_morse_data.py index c12687fde..79605801c 100644 --- a/abydos/phonetic/_beider_morse_data.py +++ b/abydos/phonetic/_beider_morse_data.py @@ -891,7 +891,7 @@ # 1. 
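# Usage sketch for the constructor-configured BeiderMorse above (assumes the
# patched abydos 0.4.0 API): language, name mode, and match mode are fixed
# when the instance is built, so one configured object can encode many names.
# Concrete output strings are omitted here; see the doctests above.
from abydos.phonetic import BeiderMorse

bm_german_exact = BeiderMorse(language_arg='German', match_mode='exact')
for name in ('Christopher', 'Schmidt'):
    print(bm_german_exact.encode(name))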
following are rules to accept the language # 1.1 Special letter combinations ('^o’', 32, True), - ('^o\'', 32, True), + ("^o'", 32, True), ('^mc', 32, True), ('^fitz', 32, True), ('ceau', 65600, True), @@ -1062,7 +1062,7 @@ ('ث', 2, True), # tha' ('ج', 2, True), # jim ('ح', 2, True), # h.a' - ('خ\'', 2, True), # kha' + ("خ\'", 2, True), # kha' ('د', 2, True), # dal (isol + init) ('ذ', 2, True), # dhal (isol + init) ('ر', 2, True), # ra' (isol + init) @@ -1489,7 +1489,7 @@ ('ź', '', '', '(Z[16384]|z)'), ('ž', '', '', 'Z'), # czech + latvian ('ß', '', '', 's'), # german - ('\'', '', '', ''), # russian + ("'", '', '', ''), # russian ('"', '', '', ''), # russian ('o', '', '[bcćdgklłmnńrsśtwzźż]', '(O|P[16384])'), # LATIN ALPHABET @@ -1794,7 +1794,7 @@ _GEN_RULES_ENGLISH = ( # CONSONANTS ('’', '', '', ''), # O’Neill - ('\'', '', '', ''), # O’Neill + ("'", '', '', ''), # O’Neill ('mc', '^', '', 'mak'), # McDonald ('tz', '', '', 'ts'), # Fitzgerald ('tch', '', '', 'tS'), @@ -2908,7 +2908,7 @@ ('ee', '', '', '(aje|i)'), ('e', '[aou]', '', 'je'), ('oo', '', '', '(oo|u)'), - ('\'', '', '', ''), + ("'", '', '', ''), ('"', '', '', ''), ('aue', '', '', 'aue'), # LATIN ALPHABET @@ -4739,13 +4739,13 @@ ('gauz$', 131072, True), ('gauz$', 131072, True), ('goltz$', 131072, True), - ('gol\'tz$', 131072, True), + ("gol'tz$", 131072, True), ('golts$', 131072, True), - ('gol\'ts$', 131072, True), + ("gol'ts$", 131072, True), ('^goltz', 131072, True), - ('^gol\'tz', 131072, True), + ("^gol'tz", 131072, True), ('^golts', 131072, True), - ('^gol\'ts', 131072, True), + ("^gol'ts", 131072, True), ('gendler$', 131072, True), ('gejmer$', 131072, True), ('gejm$', 131072, True), @@ -4992,13 +4992,13 @@ ('gh', '', '[ei]', '(g[65536]|gh)'), ('gauz', '', '$', 'haus'), ('gaus', '', '$', 'haus'), - ('gol\'ts', '', '$', 'holts'), + ("gol'ts", '', '$', 'holts'), ('golts', '', '$', 'holts'), - ('gol\'tz', '', '$', 'holts'), + ("gol'tz", '', '$', 'holts'), ('goltz', '', '', 'holts'), - ('gol\'ts', '^', '', 'holts'), + ("gol'ts", '^', '', 'holts'), ('golts', '^', '', 'holts'), - ('gol\'tz', '^', '', 'holts'), + ("gol'tz", '^', '', 'holts'), ('goltz', '^', '', 'holts'), ('gendler', '', '$', 'hendler'), ('gejmer', '', '$', 'hajmer'), @@ -5130,7 +5130,7 @@ ('ú', '', '', 'u'), ('ű', '', '', 'Q'), # hungarian ('ß', '', '', 's'), # german - ('\'', '', '', ''), + ("'", '', '', ''), ('"', '', '', ''), ('a', '', '[bcdgkpstwzż]', '(A|B[16384])'), ('e', '', '[bcdgkpstwzż]', '(E|F[16384])'), @@ -5872,9 +5872,9 @@ ('zyo', '', '', 'zo'), ('gauz', '', '$', 'haus'), ('gaus', '', '$', 'haus'), - ('gol\'ts', '', '$', 'holts'), + ("gol'ts", '', '$', 'holts'), ('golts', '', '$', 'holts'), - ('gol\'tz', '', '$', 'holts'), + ("gol'tz", '', '$', 'holts'), ('goltz', '', '$', 'holts'), ('gejmer', '', '$', 'hajmer'), ('gejm', '', '$', 'hajm'), @@ -5967,7 +5967,7 @@ ('e', '[aou]', '', 'je'), ('y', '', '', 'I'), ('oo', '', '', '(oo|u)'), # not in DJSRE - ('\'', '', '', ''), + ("'", '', '', ''), ('"', '', '', ''), ('aue', '', '', 'aue'), # TRIVIAL diff --git a/abydos/phonetic/_caverphone.py b/abydos/phonetic/_caverphone.py index 034ec5578..4b0ca8407 100644 --- a/abydos/phonetic/_caverphone.py +++ b/abydos/phonetic/_caverphone.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Caverphone', 'caverphone'] @@ -41,17 +44,72 @@ class Caverphone(_Phonetic): A description of version 2 of the algorithm can be found in :cite:`Hood:2004`. + + .. 
versionadded:: 0.3.6 """ - def encode(self, word, version=2): + def __init__(self, version=2): + """Initialize Caverphone instance. + + Parameters + ---------- + version : int + The version of Caverphone to employ for encoding (defaults to 2) + + + .. versionadded:: 0.4.0 + + """ + self._version = version + + def encode_alpha(self, word): + """Return the alphabetic Caverphone code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Caverphone value + + Examples + -------- + >>> pe = Caverphone() + >>> pe.encode_alpha('Christopher') + 'KRSTFA' + >>> pe.encode_alpha('Niall') + 'NA' + >>> pe.encode_alpha('Smith') + 'SMT' + >>> pe.encode_alpha('Schmidt') + 'SKMT' + + >>> pe_1 = Caverphone(version=1) + >>> pe_1.encode_alpha('Christopher') + 'KRSTF' + >>> pe_1.encode_alpha('Niall') + 'N' + >>> pe_1.encode_alpha('Smith') + 'SMT' + >>> pe_1.encode_alpha('Schmidt') + 'SKMT' + + + .. versionadded:: 0.4.0 + + """ + return self.encode(word).rstrip('1') + + def encode(self, word): """Return the Caverphone code for a word. Parameters ---------- word : str The word to transform - version : int - The version of Caverphone to employ for encoding (defaults to 2) Returns ------- @@ -70,15 +128,21 @@ def encode(self, word, version=2): >>> pe.encode('Schmidt') 'SKMT111111' - >>> pe.encode('Christopher', 1) + >>> pe_1 = Caverphone(version=1) + >>> pe_1.encode('Christopher') 'KRSTF1' - >>> pe.encode('Niall', 1) + >>> pe_1.encode('Niall') 'N11111' - >>> pe.encode('Smith', 1) + >>> pe_1.encode('Smith') 'SMT111' - >>> pe.encode('Schmidt', 1) + >>> pe_1.encode('Schmidt') 'SKMT11' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = word.lower() word = ''.join(c for c in word if c in self._lc_set) @@ -98,13 +162,15 @@ def _squeeze_replace(word, char): str The word with instances of char squeezed down to one + .. 
versionadded:: 0.1.0 + """ while char * 2 in word: word = word.replace(char * 2, char) return word.replace(char, char.upper()) # the main replacement algorithm - if version != 1 and word[-1:] == 'e': + if self._version != 1 and word[-1:] == 'e': word = word[:-1] if word: if word[:5] == 'cough': @@ -115,7 +181,7 @@ def _squeeze_replace(word, char): word = 'tou2f' + word[5:] if word[:6] == 'enough': word = 'enou2f' + word[6:] - if version != 1 and word[:6] == 'trough': + if self._version != 1 and word[:6] == 'trough': word = 'trou2f' + word[6:] if word[:2] == 'gn': word = '2n' + word[2:] @@ -145,7 +211,7 @@ def _squeeze_replace(word, char): word = 'A' + word[1:] for vowel in 'aeiou': word = word.replace(vowel, '3') - if version != 1: + if self._version != 1: word = word.replace('j', 'y') if word[:2] == 'y3': word = 'Y3' + word[2:] @@ -159,41 +225,41 @@ def _squeeze_replace(word, char): word = _squeeze_replace(word, char) word = word.replace('w3', 'W3') - if version == 1: + if self._version == 1: word = word.replace('wy', 'Wy') word = word.replace('wh3', 'Wh3') - if version == 1: + if self._version == 1: word = word.replace('why', 'Why') - if version != 1 and word[-1:] == 'w': + if self._version != 1 and word[-1:] == 'w': word = word[:-1] + '3' word = word.replace('w', '2') if word[:1] == 'h': word = 'A' + word[1:] word = word.replace('h', '2') word = word.replace('r3', 'R3') - if version == 1: + if self._version == 1: word = word.replace('ry', 'Ry') - if version != 1 and word[-1:] == 'r': + if self._version != 1 and word[-1:] == 'r': word = word[:-1] + '3' word = word.replace('r', '2') word = word.replace('l3', 'L3') - if version == 1: + if self._version == 1: word = word.replace('ly', 'Ly') - if version != 1 and word[-1:] == 'l': + if self._version != 1 and word[-1:] == 'l': word = word[:-1] + '3' word = word.replace('l', '2') - if version == 1: + if self._version == 1: word = word.replace('j', 'y') word = word.replace('y3', 'Y3') word = word.replace('y', '2') word = word.replace('2', '') - if version != 1 and word[-1:] == '3': + if self._version != 1 and word[-1:] == '3': word = word[:-1] + 'A' word = word.replace('3', '') # pad with 1s, then extract the necessary length of code word += '1' * 10 - if version != 1: + if self._version != 1: word = word[:10] else: word = word[:6] @@ -201,6 +267,12 @@ def _squeeze_replace(word, char): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Caverphone.encode method instead.', +) def caverphone(word, version=2): """Return the Caverphone code for a word. @@ -238,8 +310,10 @@ def caverphone(word, version=2): >>> caverphone('Schmidt', 1) 'SKMT11' + .. versionadded:: 0.1.0 + """ - return Caverphone().encode(word, version) + return Caverphone(version).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_daitch_mokotoff.py b/abydos/phonetic/_daitch_mokotoff.py index 00cd61a13..f29197551 100644 --- a/abydos/phonetic/_daitch_mokotoff.py +++ b/abydos/phonetic/_daitch_mokotoff.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['DaitchMokotoff', 'dm_soundex'] @@ -43,6 +46,8 @@ class DaitchMokotoff(_Phonetic): Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values of a word as a set. A collection is necessary since there can be multiple values for a single word. + + .. 
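# Usage sketch for the Caverphone changes above (assumes the patched abydos
# 0.4.0 API): the algorithm version is now selected in the constructor, and
# encode_alpha simply drops the trailing '1' padding from encode's output.
# The expected values repeat the doctests shown above.
from abydos.phonetic import Caverphone

cp2 = Caverphone()           # version 2 (default), ten-character codes
cp1 = Caverphone(version=1)  # version 1, six-character codes

assert cp2.encode('Christopher') == 'KRSTFA1111'
assert cp1.encode('Christopher') == 'KRSTF1'
assert cp2.encode_alpha('Christopher') == cp2.encode('Christopher').rstrip('1')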
versionadded:: 0.3.6 """ _dms_table = { @@ -257,13 +262,16 @@ class DaitchMokotoff(_Phonetic): _uc_v_set = set('AEIJOUY') - def encode(self, word, max_length=6, zero_pad=True): - """Return the Daitch-Mokotoff Soundex code for a word. + _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'AYstTSKNPLR')) + _alphabetic_non_initials = dict( + zip((ord(_) for _ in '0123456789'), ' A TSKNPLR') + ) + + def __init__(self, max_length=6, zero_pad=True): + """Initialize DaitchMokotoff instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 6; must be between 6 and 64) @@ -271,6 +279,64 @@ def encode(self, word, max_length=6, zero_pad=True): Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + # Require a max_length of at least 6 and not more than 64 + if max_length != -1: + self._max_length = min(max(6, max_length), 64) + else: + self._max_length = 64 + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Daitch-Mokotoff Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Daitch-Mokotoff Soundex value + + Examples + -------- + >>> pe = DaitchMokotoff() + >>> sorted(pe.encode_alpha('Christopher')) + ['KRSTPR', 'SRSTPR'] + >>> pe.encode_alpha('Niall') + {'NL'} + >>> pe.encode_alpha('Smith') + {'SNT'} + >>> pe.encode_alpha('Schmidt') + {'SNT'} + + >>> sorted(DaitchMokotoff(max_length=20, + ... zero_pad=False).encode_alpha('The quick brown fox')) + ['TKKPRPNPKS', 'TKSKPRPNPKS'] + + + .. versionadded:: 0.4.0 + + """ + alphas = { + code.rstrip('0').translate(self._alphabetic) + for code in self.encode(word) + } + return {code[:1] + code[1:].replace('Y', 'A') for code in alphas} + + def encode(self, word): + """Return the Daitch-Mokotoff Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -288,19 +354,18 @@ def encode(self, word, max_length=6, zero_pad=True): >>> pe.encode('Schmidt') {'463000'} - >>> sorted(pe.encode('The quick brown fox', max_length=20, - ... zero_pad=False)) + >>> sorted(DaitchMokotoff(max_length=20, + ... zero_pad=False).encode('The quick brown fox')) ['35457976754', '3557976754'] + + .. versionadded:: 0.1.0 + .. 
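# Mechanism sketch for the encode_alpha methods added in this patch: a
# translation table built with dict(zip(...)) maps each digit's code point
# to a letter, and str.translate applies it in one pass after the zero
# padding is stripped. The digit-to-letter mapping below is a made-up
# example, not the actual Daitch-Mokotoff table.
_DIGITS_TO_LETTERS = dict(zip((ord(d) for d in '0123456789'), 'AYSTKNPLRX'))

numeric_code = '463000'
alpha_code = numeric_code.rstrip('0').translate(_DIGITS_TO_LETTERS)
assert alpha_code == 'KPT'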
versionchanged:: 0.3.6 + Encapsulated in class + """ dms = [''] # initialize empty code list - # Require a max_length of at least 6 and not more than 64 - if max_length != -1: - max_length = min(max(6, max_length), 64) - else: - max_length = 64 - # uppercase, normalize, decompose, and filter non-A-Z word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') @@ -308,8 +373,8 @@ def encode(self, word, max_length=6, zero_pad=True): # Nothing to convert, return base case if not word: - if zero_pad: - return {'0' * max_length} + if self._zero_pad: + return {'0' * self._max_length} return {'0'} pos = 0 @@ -351,13 +416,21 @@ def encode(self, word, max_length=6, zero_pad=True): ) # Trim codes and return set - if zero_pad: - dms = ((_ + ('0' * max_length))[:max_length] for _ in dms) + if self._zero_pad: + dms = ( + (_ + ('0' * self._max_length))[: self._max_length] for _ in dms + ) else: - dms = (_[:max_length] for _ in dms) + dms = (_[: self._max_length] for _ in dms) return set(dms) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the DaitchMokotoff.encode method instead.', +) def dm_soundex(word, max_length=6, zero_pad=True): """Return the Daitch-Mokotoff Soundex code for a word. @@ -393,8 +466,12 @@ def dm_soundex(word, max_length=6, zero_pad=True): ... zero_pad=False)) ['35457976754', '3557976754'] + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - return DaitchMokotoff().encode(word, max_length, zero_pad) + return DaitchMokotoff(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_davidson.py b/abydos/phonetic/_davidson.py index 7ceb517e4..a5eac4fa2 100644 --- a/abydos/phonetic/_davidson.py +++ b/abydos/phonetic/_davidson.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Davidson', 'davidson'] @@ -43,11 +46,28 @@ class Davidson(_Phonetic): :cite:`Dolby:1970` identifies this as having been the name compression algorithm used by SABRE. + + .. versionadded:: 0.3.6 """ _trans = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '', 89: ''} - def encode(self, lname, fname='.', omit_fname=False): + def __init__(self, omit_fname=False): + """Initialize Davidson instance. + + Parameters + ---------- + omit_fname : bool + Set to True to completely omit the first character of the first + name + + + .. versionadded:: 0.4.0 + + """ + self._omit_fname = omit_fname + + def encode(self, lname, fname='.'): """Return Davidson's Consonant Code. Parameters @@ -57,9 +77,6 @@ def encode(self, lname, fname='.', omit_fname=False): fname : str First name (optional), of which the first character is included in the code. - omit_fname : bool - Set to True to completely omit the first character of the first - name Returns ------- @@ -84,6 +101,11 @@ def encode(self, lname, fname='.', omit_fname=False): >>> pe.encode('Wasserman', 'Tabitha') 'WSRMT' + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ lname = text_type(lname.upper()) code = self._delete_consecutive_repeats( @@ -91,12 +113,18 @@ def encode(self, lname, fname='.', omit_fname=False): ) code = code[:4] + (4 - len(code)) * ' ' - if not omit_fname: + if not self._omit_fname: code += fname[:1].upper() return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Davidson.encode method instead.', +) def davidson(lname, fname='.', omit_fname=False): """Return Davidson's Consonant Code. @@ -134,8 +162,10 @@ def davidson(lname, fname='.', omit_fname=False): >>> davidson('Wasserman', 'Tabitha') 'WSRMT' + .. versionadded:: 0.3.0 + """ - return Davidson().encode(lname, fname, omit_fname) + return Davidson(omit_fname).encode(lname, fname) if __name__ == '__main__': diff --git a/abydos/phonetic/_dolby.py b/abydos/phonetic/_dolby.py index 807885a35..83a7d09fb 100644 --- a/abydos/phonetic/_dolby.py +++ b/abydos/phonetic/_dolby.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Dolby', 'dolby'] @@ -42,15 +45,15 @@ class Dolby(_Phonetic): This follows "A Spelling Equivalent Abbreviation Algorithm For Personal Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): - r"""Return the Dolby Code of a name. + def __init__(self, max_length=-1, keep_vowels=False, vowel_char='*'): + r"""Initialize Dolby instance. Parameters ---------- - word : str - The word to transform max_length : int Maximum length of the returned Dolby code -- this also activates the fixed-length code mode if it is greater than 0 @@ -59,6 +62,55 @@ def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): vowel_char : str The vowel marker character (default to \*) + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._keep_vowels = keep_vowels + self._vowel_char = vowel_char + + def encode_alpha(self, word): + """Return the alphabetic Dolby Code of a name. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Dolby Code + + Examples + -------- + >>> pe = Dolby() + >>> pe.encode_alpha('Hansen') + 'HANSN' + >>> pe.encode_alpha('Larsen') + 'LARSN' + >>> pe.encode_alpha('Aagaard') + 'AGR' + >>> pe.encode_alpha('Braaten') + 'BRADN' + >>> pe.encode_alpha('Sandvik') + 'SANVK' + + + .. versionadded:: 0.4.0 + + """ + return self.encode(word).replace(self._vowel_char, 'A') + + def encode(self, word): + """Return the Dolby Code of a name. 
+ + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -77,15 +129,17 @@ def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): 'BR*DN' >>> pe.encode('Sandvik') 'S*NVK' - >>> pe.encode('Hansen', max_length=6) + + >>> pe_6 = Dolby(max_length=6) + >>> pe_6.encode('Hansen') 'H*NS*N' - >>> pe.encode('Larsen', max_length=6) + >>> pe_6.encode('Larsen') 'L*RS*N' - >>> pe.encode('Aagaard', max_length=6) + >>> pe_6.encode('Aagaard') '*G*R ' - >>> pe.encode('Braaten', max_length=6) + >>> pe_6.encode('Braaten') 'BR*D*N' - >>> pe.encode('Sandvik', max_length=6) + >>> pe_6.encode('Sandvik') 'S*NF*K' >>> pe.encode('Smith') @@ -98,17 +152,23 @@ def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): 'SM*D' >>> pe.encode('Ashcroft') '*SKRFD' - >>> pe.encode('Smith', max_length=6) + + >>> pe_6.encode('Smith') 'SM*D ' - >>> pe.encode('Waters', max_length=6) + >>> pe_6.encode('Waters') 'W*D*RS' - >>> pe.encode('James', max_length=6) + >>> pe_6.encode('James') 'J*M*S ' - >>> pe.encode('Schmidt', max_length=6) + >>> pe_6.encode('Schmidt') 'SM*D ' - >>> pe.encode('Ashcroft', max_length=6) + >>> pe_6.encode('Ashcroft') '*SKRFD' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) @@ -181,7 +241,7 @@ def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): pos = word.find('K', pos + 1) # Rule FL6 - if max_length > 0 and word[-1:] == 'E': + if self._max_length > 0 and word[-1:] == 'E': word = word[:-1] # Rule 5 (FL7) @@ -200,52 +260,58 @@ def encode(self, word, max_length=-1, keep_vowels=False, vowel_char='*'): word = word.replace('GH', '') # Rule FL9 - if max_length > 0: + if self._max_length > 0: word = word.replace('V', 'F') # Rules 7-9 (FL10-FL12) - first = 1 + (1 if max_length > 0 else 0) + first = 1 + (1 if self._max_length > 0 else 0) code = '' for pos, char in enumerate(word): if char in self._uc_vy_set: - if first or keep_vowels: - code += vowel_char + if first or self._keep_vowels: + code += self._vowel_char first -= 1 elif pos > 0 and char in {'W', 'H'}: continue else: code += char - if max_length > 0: + if self._max_length > 0: # Rule FL13 - if len(code) > max_length and code[-1:] == 'S': + if len(code) > self._max_length and code[-1:] == 'S': code = code[:-1] - if keep_vowels: - code = code[:max_length] + if self._keep_vowels: + code = code[: self._max_length] else: # Rule FL14 - code = code[: max_length + 2] + code = code[: self._max_length + 2] # Rule FL15 - while len(code) > max_length: - vowels = len(code) - max_length + while len(code) > self._max_length: + vowels = len(code) - self._max_length excess = vowels - 1 word = code code = '' for char in word: - if char == vowel_char: + if char == self._vowel_char: if vowels: code += char vowels -= 1 else: code += char - code = code[: max_length + excess] + code = code[: self._max_length + excess] # Rule FL16 - code += ' ' * (max_length - len(code)) + code += ' ' * (self._max_length - len(code)) return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Dolby.encode method instead.', +) def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'): r"""Return the Dolby Code of a name. @@ -312,8 +378,10 @@ def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'): >>> dolby('Ashcroft', max_length=6) '*SKRFD' + .. 
versionadded:: 0.3.0 + """ - return Dolby().encode(word, max_length, keep_vowels, vowel_char) + return Dolby(max_length, keep_vowels, vowel_char).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_double_metaphone.py b/abydos/phonetic/_double_metaphone.py index ebcfcd5c7..4dbeb1e82 100644 --- a/abydos/phonetic/_double_metaphone.py +++ b/abydos/phonetic/_double_metaphone.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['DoubleMetaphone', 'double_metaphone'] @@ -38,18 +41,67 @@ class DoubleMetaphone(_Phonetic): Based on Lawrence Philips' (Visual) C++ code from 1999 :cite:`Philips:2000`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=-1): + def __init__(self, max_length=-1): + """Initialize DoubleMetaphone instance. + + Parameters + ---------- + max_length : int + Maximum length of the returned Dolby code -- this also activates + the fixed-length code mode if it is greater than 0 + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + # Require a max_length of at least 4 + if self._max_length != -1: + self._max_length = max(4, max_length) + + def encode_alpha(self, word): + """Return the alphabetic Double Metaphone code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + tuple + The alphabetic Double Metaphone value(s) + + Examples + -------- + >>> pe = DoubleMetaphone() + >>> pe.encode_alpha('Christopher') + ('KRSTFR', '') + >>> pe.encode_alpha('Niall') + ('NL', '') + >>> pe.encode_alpha('Smith') + ('SMÞ', 'XMT') + >>> pe.encode_alpha('Schmidt') + ('XMT', 'SMT') + + + .. versionadded:: 0.4.0 + + """ + return tuple(code.replace('0', 'Þ') for code in self.encode(word)) + + def encode(self, word): """Return the Double Metaphone code for a word. Parameters ---------- word : str The word to transform - max_length : int - The maximum length of the returned Double Metaphone codes (defaults - to unlmited, but in Philips' original implementation this was 4) Returns ------- @@ -68,11 +120,12 @@ def encode(self, word, max_length=-1): >>> pe.encode('Schmidt') ('XMT', 'SMT') - """ - # Require a max_length of at least 4 - if max_length != -1: - max_length = max(4, max_length) + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + + """ primary = '' secondary = '' @@ -84,6 +137,8 @@ def _slavo_germanic(): bool True if the word appears to be Slavic or Germanic + .. versionadded:: 0.1.0 + """ if 'W' in word or 'K' in word or 'CZ' in word: return True @@ -104,6 +159,8 @@ def _metaph_add(pri, sec=''): tuple A new metaphone tuple with the supplied elements + .. versionadded:: 0.1.0 + """ newpri = primary newsec = secondary @@ -129,6 +186,8 @@ def _is_vowel(pos): bool True if the character is a vowel + .. versionadded:: 0.1.0 + """ if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}: return True @@ -147,6 +206,8 @@ def _get_at(pos): str Character at word[pos] + .. versionadded:: 0.1.0 + """ return word[pos] @@ -167,6 +228,8 @@ def _string_at(pos, slen, substrings): bool True if word[pos:pos+slen] is in substrings + .. 
versionadded:: 0.1.0 + """ if pos < 0: return False @@ -949,15 +1012,21 @@ def _string_at(pos, slen, substrings): else: current += 1 - if max_length > 0: - primary = primary[:max_length] - secondary = secondary[:max_length] + if self._max_length > 0: + primary = primary[: self._max_length] + secondary = secondary[: self._max_length] if primary == secondary: secondary = '' return primary, secondary +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the DoubleMetaphone.encode method instead.', +) def double_metaphone(word, max_length=-1): """Return the Double Metaphone code for a word. @@ -987,8 +1056,10 @@ def double_metaphone(word, max_length=-1): >>> double_metaphone('Schmidt') ('XMT', 'SMT') + .. versionadded:: 0.1.0 + """ - return DoubleMetaphone().encode(word, max_length) + return DoubleMetaphone(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_eudex.py b/abydos/phonetic/_eudex.py index 405ac93a5..f66ac64c8 100644 --- a/abydos/phonetic/_eudex.py +++ b/abydos/phonetic/_eudex.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Eudex', 'eudex'] @@ -42,6 +45,8 @@ class Eudex(_Phonetic): (not the reference implementation) at :cite:`Ticki:2016`. Further details can be found at :cite:`Ticki:2016b`. + + .. versionadded:: 0.3.6 """ _trailing_phones = { @@ -168,15 +173,27 @@ class Eudex(_Phonetic): 'ÿ': 0b11100101, # ÿ } - def encode(self, word, max_length=8): + def __init__(self, max_length=8): + """Initialize Eudex instance. + + Parameters + ---------- + max_length : int + The length in bits of the code returned (default 8) + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode(self, word): """Return the eudex phonetic hash of a word. Parameters ---------- word : str The word to transform - max_length : int - The length in bits of the code returned (default 8) Returns ------- @@ -197,6 +214,11 @@ def encode(self, word, max_length=8): >>> pe.encode('Schmidt') 720589151732307997 + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # Lowercase input & filter unknown characters word = ''.join( @@ -220,8 +242,8 @@ def encode(self, word, max_length=8): # Add padding after first character & trim beyond max_length values = ( [condensed_values[0]] - + [0] * max(0, max_length - len(condensed_values)) - + condensed_values[1:max_length] + + [0] * max(0, self._max_length - len(condensed_values)) + + condensed_values[1 : self._max_length] ) # Combine individual character values into eudex hash @@ -232,6 +254,12 @@ def encode(self, word, max_length=8): return hash_value +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Eudex.encode method instead.', +) def eudex(word, max_length=8): """Return the eudex phonetic hash of a word. @@ -262,8 +290,10 @@ def eudex(word, max_length=8): >>> eudex('Schmidt') 720589151732307997 + .. 
versionadded:: 0.3.0 + """ - return Eudex().encode(word, max_length) + return Eudex(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_fonem.py b/abydos/phonetic/_fonem.py index 86fed8f3f..1122f3bd4 100644 --- a/abydos/phonetic/_fonem.py +++ b/abydos/phonetic/_fonem.py @@ -31,13 +31,20 @@ from re import compile as re_compile from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['FONEM', 'fonem'] +def _get_parts(m): + return (m.group(1) or '') + (m.group(2) or '') + + class FONEM(_Phonetic): """FONEM. @@ -47,6 +54,9 @@ class FONEM(_Phonetic): Guillaume Plique's Javascript implementation :cite:`Plique:2018` at https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js was also consulted for this implementation. + + + .. versionadded:: 0.3.6 """ # I don't see a sane way of doing this without regexps :( @@ -122,7 +132,7 @@ class FONEM(_Phonetic): '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$' ), - lambda m: (m.group(1) or '') + (m.group(2) or ''), + _get_parts, ), 'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'), 'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'), @@ -225,6 +235,11 @@ def encode(self, word): >>> pe.encode('Pelletier') 'PELETIER' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # normalize, upper-case, and filter non-French letters word = unicode_normalize('NFKD', text_type(word.upper())) @@ -241,6 +256,12 @@ def encode(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the FONEM.encode method instead.', +) def fonem(word): """Return the FONEM code of a word. @@ -269,6 +290,9 @@ def fonem(word): >>> fonem('Pelletier') 'PELETIER' + + .. versionadded:: 0.3.0 + """ return FONEM().encode(word) diff --git a/abydos/phonetic/_fuzzy_soundex.py b/abydos/phonetic/_fuzzy_soundex.py index 92be6fc8c..da1dc150d 100644 --- a/abydos/phonetic/_fuzzy_soundex.py +++ b/abydos/phonetic/_fuzzy_soundex.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['FuzzySoundex', 'fuzzy_soundex'] @@ -42,6 +45,8 @@ class FuzzySoundex(_Phonetic): Fuzzy Soundex is an algorithm derived from Soundex, defined in :cite:`Holmes:2002`. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -51,19 +56,70 @@ class FuzzySoundex(_Phonetic): ) ) - def encode(self, word, max_length=5, zero_pad=True): - """Return the Fuzzy Soundex code for a word. + _alphabetic = dict(zip((ord(_) for _ in '01345679'), 'APTLNRKS')) + + def __init__(self, max_length=5, zero_pad=True): + """Initialize FuzzySoundex instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + # Clamp max_length to [4, 64] + if max_length != -1: + self._max_length = min(max(4, max_length), 64) + else: + self._max_length = 64 + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Fuzzy Soundex code for a word. 
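For orientation, a short sketch of how the new `encode_alpha` relates to `encode`: it strips the trailing zero padding and maps the digits back onto letters via `_alphabetic`. Outputs are taken from the doctests below; the package-level import is an assumption.

    >>> from abydos.phonetic import FuzzySoundex, fuzzy_soundex
    >>> FuzzySoundex().encode('Smith')
    'S5300'
    >>> FuzzySoundex().encode_alpha('Smith')   # '53' maps to 'NT' once the padding is stripped
    'SNT'
    >>> fuzzy_soundex('Smith')                 # deprecated wrapper, unchanged result
    'S5300'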
+ + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Fuzzy Soundex value + + Examples + -------- + >>> pe = FuzzySoundex() + >>> pe.encode_alpha('Christopher') + 'KRSTP' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SNT' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the Fuzzy Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -81,19 +137,18 @@ def encode(self, word, max_length=5, zero_pad=True): >>> pe.encode('Smith') 'S5300' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') - # Clamp max_length to [4, 64] - if max_length != -1: - max_length = min(max(4, max_length), 64) - else: - max_length = 64 - if not word: - if zero_pad: - return '0' * max_length + if self._zero_pad: + return '0' * self._max_length return '0' if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}: @@ -153,12 +208,18 @@ def encode(self, word, max_length=5, zero_pad=True): sdx = sdx.replace('0', '') - if zero_pad: - sdx += '0' * max_length + if self._zero_pad: + sdx += '0' * self._max_length - return sdx[:max_length] + return sdx[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the FuzzySoundex.encode method instead.', +) def fuzzy_soundex(word, max_length=5, zero_pad=True): """Return the Fuzzy Soundex code for a word. @@ -189,8 +250,10 @@ def fuzzy_soundex(word, max_length=5, zero_pad=True): >>> fuzzy_soundex('Smith') 'S5300' + .. versionadded:: 0.1.0 + """ - return FuzzySoundex().encode(word, max_length, zero_pad) + return FuzzySoundex(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_haase.py b/abydos/phonetic/_haase.py index 3f2e450ab..c0bbcd29a 100644 --- a/abydos/phonetic/_haase.py +++ b/abydos/phonetic/_haase.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2018 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -31,10 +31,13 @@ from itertools import product from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Haase', 'haase_phonetik'] @@ -45,11 +48,64 @@ class Haase(_Phonetic): Based on the algorithm described at :cite:`Prante:2015`. Based on the original :cite:`Haase:2000`. + + .. versionadded:: 0.3.6 """ _uc_v_set = set('AEIJOUY') - def encode(self, word, primary_only=False): + _alphabetic = dict(zip((ord(_) for _ in '123456789'), 'PTFKLNRSA')) + + def __init__(self, primary_only=False): + """Initialize Haase instance. + + Parameters + ---------- + primary_only : bool + If True, only the primary code is returned + + + .. versionadded:: 0.4.0 + + """ + self._primary_only = primary_only + + def encode_alpha(self, word): + """Return the alphabetic Haase Phonetik code for a word. 
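A brief sketch of the class-based calls; the return values are tuples of variant codes, and `encode_alpha` translates the digits through `_alphabetic`. Outputs mirror the doctests in this hunk, and the package-level import is an assumption.

    >>> from abydos.phonetic import Haase, haase_phonetik
    >>> Haase().encode('Schmidt')
    ('8692', '4692')
    >>> Haase().encode_alpha('Schmidt')
    ('SNAT', 'KNAT')
    >>> haase_phonetik('Schmidt')          # deprecated since 0.4.0
    ('8692', '4692')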
+ + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + tuple + The alphabetic Haase Phonetik value + + Examples + -------- + >>> pe = Haase() + >>> pe.encode_alpha('Joachim') + ('AKAN',) + >>> pe.encode_alpha('Christoph') + ('KRASTAF', 'SRASTAF') + >>> pe.encode_alpha('Jörg') + ('ARK',) + >>> pe.encode_alpha('Smith') + ('SNAT',) + >>> pe.encode_alpha('Schmidt') + ('SNAT', 'KNAT') + + + .. versionadded:: 0.4.0 + + """ + return tuple( + code.translate(self._alphabetic) for code in self.encode(word) + ) + + def encode(self, word): """Return the Haase Phonetik (numeric output) code for a word. While the output code is numeric, it is nevertheless a str. @@ -58,8 +114,6 @@ def encode(self, word, primary_only=False): ---------- word : str The word to transform - primary_only : bool - If True, only the primary code is returned Returns ------- @@ -80,6 +134,11 @@ def encode(self, word, primary_only=False): >>> pe.encode('Schmidt') ('8692', '4692') + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _after(word, pos, letters): @@ -99,6 +158,8 @@ def _after(word, pos, letters): bool True if word[pos] follows one of letters + .. versionadded:: 0.3.0 + """ if pos > 0 and word[pos - 1] in letters: return True @@ -121,6 +182,8 @@ def _before(word, pos, letters): bool True if word[pos] precedes one of letters + .. versionadded:: 0.3.0 + """ if pos + 1 < len(word) and word[pos + 1] in letters: return True @@ -135,7 +198,7 @@ def _before(word, pos, letters): word = ''.join(c for c in word if c in self._uc_set) variants = [] - if primary_only: + if self._primary_only: variants = [word] else: pos = 0 @@ -245,8 +308,14 @@ def _haase_code(word): return encoded +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Haase.encode method instead.', +) def haase_phonetik(word, primary_only=False): - """Return the Haase Phonetik (numeric output) code for a word. + """Return the Haase Phonetik code for a word. This is a wrapper for :py:meth:`Haase.encode`. @@ -275,8 +344,10 @@ def haase_phonetik(word, primary_only=False): >>> haase_phonetik('Schmidt') ('8692', '4692') + .. versionadded:: 0.3.0 + """ - return Haase().encode(word, primary_only) + return Haase(primary_only).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_henry_early.py b/abydos/phonetic/_henry_early.py index f7e716acd..198ea1a6c 100644 --- a/abydos/phonetic/_henry_early.py +++ b/abydos/phonetic/_henry_early.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['HenryEarly', 'henry_early'] @@ -42,6 +45,8 @@ class HenryEarly(_Phonetic): The early version of Henry coding is given in :cite:`Legare:1972`. This is different from the later version defined in :cite:`Henry:1976`. + + .. versionadded:: 0.3.6 """ _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ') @@ -56,15 +61,27 @@ class HenryEarly(_Phonetic): } _simple = {'W': 'V', 'X': 'S', 'Z': 'S'} - def encode(self, word, max_length=3): + def __init__(self, max_length=3): + """Initialize HenryEarly instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to 3) + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode(self, word): """Calculate the early version of the Henry code for a word. 
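The deprecated `henry_early` wrapper now simply forwards to an instance, so the two spellings below should be equivalent (outputs from the doctests that follow; import path assumed):

    >>> from abydos.phonetic import HenryEarly, henry_early
    >>> HenryEarly().encode('Marchand')
    'MRC'
    >>> henry_early('Marchand')     # same value, but flagged for removal in 0.6.0
    'MRC'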
Parameters ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to 3) Returns ------- @@ -73,17 +90,23 @@ def encode(self, word, max_length=3): Examples -------- - >>> henry_early('Marchand') + >>> pe = HenryEarly() + >>> pe.encode('Marchand') 'MRC' - >>> henry_early('Beaulieu') + >>> pe.encode('Beaulieu') 'BL' - >>> henry_early('Beaumont') + >>> pe.encode('Beaumont') 'BM' - >>> henry_early('Legrand') + >>> pe.encode('Legrand') 'LGR' - >>> henry_early('Pelletier') + >>> pe.encode('Pelletier') 'PLT' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = ''.join(c for c in word if c in self._uc_set) @@ -233,12 +256,18 @@ def encode(self, word, max_length=3): {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''} ) - if max_length != -1: - code = code[:max_length] + if self._max_length != -1: + code = code[: self._max_length] return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the HenryEarly.encode method instead.', +) def henry_early(word, max_length=3): """Calculate the early version of the Henry code for a word. @@ -269,8 +298,10 @@ def henry_early(word, max_length=3): >>> henry_early('Pelletier') 'PLT' + .. versionadded:: 0.3.0 + """ - return HenryEarly().encode(word, max_length) + return HenryEarly(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_koelner.py b/abydos/phonetic/_koelner.py index 286f1b8ce..5659ae909 100644 --- a/abydos/phonetic/_koelner.py +++ b/abydos/phonetic/_koelner.py @@ -30,10 +30,13 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = [ 'Koelner', @@ -47,6 +50,8 @@ class Koelner(_Phonetic): """Kölner Phonetik. Based on the algorithm defined by :cite:`Postel:1969`. + + .. versionadded:: 0.3.6 """ _uc_v_set = set('AEIOUJY') @@ -86,6 +91,11 @@ def encode(self, word): >>> pe.encode('Zimmermann') '86766' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _after(word, pos, letters): @@ -105,6 +115,8 @@ def _after(word, pos, letters): bool True if word[pos] follows a value in letters + .. versionadded:: 0.1.0 + """ return pos > 0 and word[pos - 1] in letters @@ -125,6 +137,8 @@ def _before(word, pos, letters): bool True if word[pos] precedes a value in letters + .. versionadded:: 0.1.0 + """ return pos + 1 < len(word) and word[pos + 1] in letters @@ -219,6 +233,11 @@ def _to_alpha(self, num): >>> pe._to_alpha('86766') 'SNRNN' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ num = ''.join(c for c in text_type(num) if c in self._num_set) return num.translate(self._num_trans) @@ -248,10 +267,21 @@ def encode_alpha(self, word): >>> pe.encode_alpha('Zimmermann') 'SNRNN' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return koelner_phonetik_num_to_alpha(koelner_phonetik(word)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Koelner.encode method instead.', +) def koelner_phonetik(word): """Return the Kölner Phonetik (numeric output) code for a word. @@ -282,10 +312,18 @@ def koelner_phonetik(word): >>> koelner_phonetik('Zimmermann') '86766' + .. 
versionadded:: 0.1.0 + """ return Koelner().encode(word) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Koelner._to_alpha method instead.', +) def koelner_phonetik_num_to_alpha(num): """Convert a Kölner Phonetik code from numeric to alphabetic. @@ -310,10 +348,18 @@ def koelner_phonetik_num_to_alpha(num): >>> koelner_phonetik_num_to_alpha('86766') 'SNRNN' + .. versionadded:: 0.1.0 + """ return Koelner()._to_alpha(num) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Koelner.encode_alpha method instead.', +) def koelner_phonetik_alpha(word): """Return the Kölner Phonetik (alphabetic output) code for a word. @@ -340,6 +386,8 @@ def koelner_phonetik_alpha(word): >>> koelner_phonetik_alpha('Zimmermann') 'SNRNN' + .. versionadded:: 0.1.0 + """ return Koelner().encode_alpha(word) diff --git a/abydos/phonetic/_lein.py b/abydos/phonetic/_lein.py index 9a3c591ed..b449b0e58 100644 --- a/abydos/phonetic/_lein.py +++ b/abydos/phonetic/_lein.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -18,7 +18,7 @@ """abydos.phonetic._lein. -Lein +Michigan LEIN (Law Enforcement Information Network) encoding """ from __future__ import ( @@ -30,17 +30,23 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ + +__all__ = ['LEIN', 'lein'] -__all__ = ['Lein', 'lein'] +class LEIN(_Phonetic): + """LEIN code. -class Lein(_Phonetic): - """Lein code. + This is Michigan LEIN (Law Enforcement Information Network) name coding, + described in :cite:`Moore:1977`. - This is Lein name coding, described in :cite:`Moore:1977`. + .. versionadded:: 0.3.6 """ _trans = dict( @@ -49,27 +55,74 @@ class Lein(_Phonetic): _del_trans = {num: None for num in (32, 65, 69, 72, 73, 79, 85, 87, 89)} - def encode(self, word, max_length=4, zero_pad=True): - """Return the Lein code for a word. + _alphabetic = dict(zip((ord(_) for _ in '12345'), 'TNLPK')) + + def __init__(self, max_length=4, zero_pad=True): + """Initialize LEIN instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic LEIN code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic LEIN code + + Examples + -------- + >>> pe = LEIN() + >>> pe.encode_alpha('Christopher') + 'CLKT' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SKNT' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the LEIN code for a word. 
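Since the class is renamed from `Lein` to `LEIN` while the functional `lein` wrapper is kept in deprecated form, a quick sketch of the intended call sites (outputs from the doctests in this hunk; package-level import assumed):

    >>> from abydos.phonetic import LEIN, lein
    >>> LEIN().encode('Christopher')
    'C351'
    >>> LEIN().encode_alpha('Christopher')   # new in 0.4.0
    'CLKT'
    >>> lein('Christopher')                  # deprecated wrapper
    'C351'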
+ + Parameters + ---------- + word : str + The word to transform + Returns ------- str - The Lein code + The LEIN code Examples -------- - >>> pe = Lein() + >>> pe = LEIN() >>> pe.encode('Christopher') 'C351' >>> pe.encode('Niall') @@ -79,6 +132,11 @@ def encode(self, word, max_length=4, zero_pad=True): >>> pe.encode('Schmidt') 'S521' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) @@ -90,16 +148,22 @@ def encode(self, word, max_length=4, zero_pad=True): word = self._delete_consecutive_repeats(word) # Rule 3 code += word.translate(self._trans) # Rule 4 - if zero_pad: - code += '0' * max_length # Rule 4 + if self._zero_pad: + code += '0' * self._max_length # Rule 4 - return code[:max_length] + return code[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the LEIN.encode method instead.', +) def lein(word, max_length=4, zero_pad=True): - """Return the Lein code for a word. + """Return the LEIN code for a word. - This is a wrapper for :py:meth:`Lein.encode`. + This is a wrapper for :py:meth:`LEIN.encode`. Parameters ---------- @@ -113,7 +177,7 @@ def lein(word, max_length=4, zero_pad=True): Returns ------- str - The Lein code + The LEIN code Examples -------- @@ -126,8 +190,10 @@ def lein(word, max_length=4, zero_pad=True): >>> lein('Schmidt') 'S521' + .. versionadded:: 0.3.0 + """ - return Lein().encode(word, max_length, zero_pad) + return LEIN(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_meta_soundex.py b/abydos/phonetic/_meta_soundex.py index 049c03814..582710d80 100644 --- a/abydos/phonetic/_meta_soundex.py +++ b/abydos/phonetic/_meta_soundex.py @@ -28,11 +28,14 @@ unicode_literals, ) +from deprecation import deprecated + from ._metaphone import Metaphone from ._phonetic import _Phonetic from ._phonetic_spanish import PhoneticSpanish from ._soundex import Soundex from ._spanish_metaphone import SpanishMetaphone +from .. import __version__ __all__ = ['MetaSoundex', 'metasoundex'] @@ -42,6 +45,8 @@ class MetaSoundex(_Phonetic): This is based on :cite:`Koneru:2017`. Only English ('en') and Spanish ('es') languages are supported, as in the original. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -50,20 +55,80 @@ class MetaSoundex(_Phonetic): '07430755015866075943077514', ) ) - _phonetic_spanish = PhoneticSpanish() - _spanish_metaphone = SpanishMetaphone() - _metaphone = Metaphone() - _soundex = Soundex() - def encode(self, word, lang='en'): + def __init__(self, lang='en'): + """Initialize MetaSoundex instance. + + Parameters + ---------- + lang : str + Either ``en`` for English or ``es`` for Spanish + + + .. versionadded:: 0.4.0 + + """ + self._lang = lang + if lang == 'en': + self._sdx = Soundex() + self._meta = Metaphone() + else: + self._sdx = PhoneticSpanish() + self._meta = SpanishMetaphone() + + def encode_alpha(self, word): + """Return the MetaSoundex code for a word. 
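Because the language is now chosen at construction time, the per-call `lang` argument disappears from `encode`. A sketch of the resulting usage (outputs from the doctests in this hunk; package-level import assumed):

    >>> from abydos.phonetic import MetaSoundex, metasoundex
    >>> MetaSoundex().encode('Ashcroft')         # English pipeline: Metaphone, then Soundex
    '0261'
    >>> MetaSoundex(lang='es').encode('Nicolás')
    '6754'
    >>> metasoundex('Nicolás', lang='es')        # deprecated wrapper, same value
    '6754'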
+ + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The MetaSoundex code + + Examples + -------- + >>> pe = MetaSoundex() + >>> pe.encode_alpha('Smith') + 'SN' + >>> pe.encode_alpha('Waters') + 'WTRK' + >>> pe.encode_alpha('James') + 'JNK' + >>> pe.encode_alpha('Schmidt') + 'SNT' + >>> pe.encode_alpha('Ashcroft') + 'AKRP' + + >>> pe = MetaSoundex(lang='es') + >>> pe.encode_alpha('Perez') + 'PRS' + >>> pe.encode_alpha('Martinez') + 'NRTNS' + >>> pe.encode_alpha('Gutierrez') + 'GTRRS' + >>> pe.encode_alpha('Santiago') + 'SNTG' + >>> pe.encode_alpha('Nicolás') + 'NKLS' + + + .. versionadded:: 0.4.0 + + """ + word = self._sdx.encode_alpha(self._meta.encode_alpha(word)) + return word + + def encode(self, word): """Return the MetaSoundex code for a word. Parameters ---------- word : str The word to transform - lang : str - Either ``en`` for English or ``es`` for Spanish Returns ------- @@ -83,28 +148,37 @@ def encode(self, word, lang='en'): '4530' >>> pe.encode('Ashcroft') '0261' - >>> pe.encode('Perez', lang='es') + + >>> pe = MetaSoundex(lang='es') + >>> pe.encode('Perez') '094' - >>> pe.encode('Martinez', lang='es') + >>> pe.encode('Martinez') '69364' - >>> pe.encode('Gutierrez', lang='es') + >>> pe.encode('Gutierrez') '83994' - >>> pe.encode('Santiago', lang='es') + >>> pe.encode('Santiago') '4638' - >>> pe.encode('Nicolás', lang='es') + >>> pe.encode('Nicolás') '6754' - """ - if lang == 'es': - return self._phonetic_spanish.encode( - self._spanish_metaphone.encode(word) - ) - word = self._soundex.encode(self._metaphone.encode(word)) - word = word[0].translate(self._trans) + word[1:] + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + + """ + word = self._sdx.encode(self._meta.encode(word)) + if self._lang == 'en': + word = word[0].translate(self._trans) + word[1:] return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MetaSoundex.encode method instead.', +) def metasoundex(word, lang='en'): """Return the MetaSoundex code for a word. @@ -145,8 +219,10 @@ def metasoundex(word, lang='en'): >>> metasoundex('Nicolás', lang='es') '6754' + .. versionadded:: 0.3.0 + """ - return MetaSoundex().encode(word, lang) + return MetaSoundex(lang).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_metaphone.py b/abydos/phonetic/_metaphone.py index 05fa65834..48bcdee72 100644 --- a/abydos/phonetic/_metaphone.py +++ b/abydos/phonetic/_metaphone.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Metaphone', 'metaphone'] @@ -42,12 +45,33 @@ class Metaphone(_Phonetic): as described in :cite:`Philips:1990b`. This incorporates some corrections to the above code, particularly some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`. + + .. versionadded:: 0.3.6 """ _frontv = {'E', 'I', 'Y'} _varson = {'C', 'G', 'P', 'S', 'T'} - def encode(self, word, max_length=-1): + def __init__(self, max_length=-1): + """Initialize AlphaSIS instance. + + Parameters + ---------- + max_length : int + The maximum length of the returned Metaphone code (defaults to 64, + but in Philips' original implementation this was 4) + + + .. 
versionadded:: 0.4.0 + + """ + # Require a max_length of at least 4 + if max_length != -1: + self._max_length = max(4, max_length) + else: + self._max_length = 64 + + def encode(self, word): """Return the Metaphone code for a word. Based on Lawrence Philips' Pick BASIC code from 1990 @@ -59,9 +83,6 @@ def encode(self, word, max_length=-1): ---------- word : str The word to transform - max_length : int - The maximum length of the returned Metaphone code (defaults to 64, - but in Philips' original implementation this was 4) Returns ------- @@ -80,13 +101,12 @@ def encode(self, word, max_length=-1): >>> pe.encode('Schmidt') 'SKMTT' - """ - # Require a max_length of at least 4 - if max_length != -1: - max_length = max(4, max_length) - else: - max_length = 64 + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + + """ # As in variable sound--those modified by adding an "h" ename = ''.join(c for c in word.upper() if c.isalnum()) ename = ename.replace('ß', 'SS') @@ -105,7 +125,7 @@ def encode(self, word, max_length=-1): elen = len(ename) - 1 metaph = '' for i in range(len(ename)): - if len(metaph) >= max_length: + if len(metaph) >= self._max_length: break if ( ename[i] not in {'G', 'T'} @@ -254,6 +274,12 @@ def encode(self, word, max_length=-1): return metaph +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Metaphone.encode method instead.', +) def metaphone(word, max_length=-1): """Return the Metaphone code for a word. @@ -283,8 +309,10 @@ def metaphone(word, max_length=-1): >>> metaphone('Schmidt') 'SKMTT' + .. versionadded:: 0.1.0 + """ - return Metaphone().encode(word, max_length) + return Metaphone(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_mra.py b/abydos/phonetic/_mra.py index c7240338c..9467ee3b8 100644 --- a/abydos/phonetic/_mra.py +++ b/abydos/phonetic/_mra.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['MRA', 'mra'] @@ -38,6 +41,8 @@ class MRA(_Phonetic): A description of the Western Airlines Surname Match Rating Algorithm can be found on page 18 of :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ def encode(self, word): @@ -65,6 +70,11 @@ def encode(self, word): >>> pe.encode('Schmidt') 'SCHMDT' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if not word: return word @@ -79,6 +89,12 @@ def encode(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the MRA.encode method instead.', +) def mra(word): """Return the MRA personal numeric identifier (PNI) for a word. @@ -105,6 +121,8 @@ def mra(word): >>> mra('Schmidt') 'SCHMDT' + .. versionadded:: 0.1.0 + """ return MRA().encode(word) diff --git a/abydos/phonetic/_norphone.py b/abydos/phonetic/_norphone.py index 4280dd64f..2d8423a10 100644 --- a/abydos/phonetic/_norphone.py +++ b/abydos/phonetic/_norphone.py @@ -28,8 +28,10 @@ unicode_literals, ) +from deprecation import deprecated from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Norphone', 'norphone'] @@ -43,6 +45,8 @@ class Norphone(_Phonetic): Norphone was designed for Norwegian, but this implementation has been extended to support Swedish vowels as well. This function incorporates the "not implemented" rules from the above file's rule set. + + .. 
versionadded:: 0.3.6 """ _uc_v_set = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'} @@ -97,6 +101,11 @@ def encode(self, word): >>> pe.encode('Sandvik') 'SNVK' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = word.upper() @@ -155,6 +164,12 @@ def encode(self, word): return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Norphone.encode method instead.', +) def norphone(word): """Return the Norphone code. @@ -183,6 +198,8 @@ def norphone(word): >>> norphone('Sandvik') 'SNVK' + .. versionadded:: 0.3.0 + """ return Norphone().encode(word) diff --git a/abydos/phonetic/_nrl.py b/abydos/phonetic/_nrl.py index 03f553883..51defaee2 100644 --- a/abydos/phonetic/_nrl.py +++ b/abydos/phonetic/_nrl.py @@ -30,7 +30,10 @@ from re import match as re_match +from deprecation import deprecated + from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['NRL', 'nrl'] @@ -39,16 +42,19 @@ class NRL(_Phonetic): """Naval Research Laboratory English-to-phoneme encoder. This is defined by :cite:`Elovitz:1976`. + + + .. versionadded:: 0.3.6 """ _rules = { ' ': ( ('', ' ', '', ' '), ('', '-', '', ''), - ('.', '\'S', '', 'z'), - ('#:.E', '\'S', '', 'z'), - ('#', '\'S', '', 'z'), - ('', '\'', '', ''), + ('.', "'S", '', 'z'), + ('#:.E', "'S", '', 'z'), + ('#', "'S", '', 'z'), + ('', "'", '', ''), ('', ',', '', ' '), ('', '.', '', ' '), ('', '?', '', ' '), @@ -124,7 +130,7 @@ class NRL(_Phonetic): ), 'E': ( ('#:', 'E', ' ', ''), - ('\':^', 'E', ' ', ''), + ("':^", 'E', ' ', ''), (' :', 'E', ' ', 'IY'), ('#', 'ED', ' ', 'd'), ('#:', 'E', 'D ', ''), @@ -283,7 +289,7 @@ class NRL(_Phonetic): ('', 'OA', '', 'OW'), (' ', 'ONLY', '', 'OWnlIY'), (' ', 'ONCE', '', 'wAHns'), - ('', 'ON\'T', '', 'OWnt'), + ('', "ON'T", '', 'OWnt'), ('C', 'O', 'N', 'AA'), ('', 'O', 'NG', 'AO'), (' :^', 'O', 'N', 'AH'), @@ -332,7 +338,7 @@ class NRL(_Phonetic): (' ', 'SCH', '', 'sk'), ('', 'S', 'C+', ''), ('#', 'SM', '', 'zm'), - ('#', 'SN', '\'', 'zAXn'), + ('#', 'SN', "'", 'zAXn'), ('', 'S', '', 's'), ), 'T': ( @@ -461,6 +467,11 @@ def encode(self, word): >>> pe.encode('Larsen') 'lAArsEHn' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _to_regex(pattern, left_match=True): @@ -519,6 +530,12 @@ def _to_regex(pattern, left_match=True): return pron +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NRL.encode method instead.', +) def nrl(word): """Return the Naval Research Laboratory phonetic encoding of a word. @@ -549,6 +566,9 @@ def nrl(word): >>> nrl('Larsen') 'lAArsEHn' + + .. versionadded:: 0.3.0 + """ return NRL().encode(word) diff --git a/abydos/phonetic/_nysiis.py b/abydos/phonetic/_nysiis.py index e09c527e0..1ca4af9d9 100644 --- a/abydos/phonetic/_nysiis.py +++ b/abydos/phonetic/_nysiis.py @@ -29,9 +29,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['NYSIIS', 'nysiis'] @@ -44,20 +47,39 @@ class NYSIIS(_Phonetic): The modified version of this algorithm is described in Appendix B of :cite:`Lynch:1977`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=6, modified=False): - """Return the NYSIIS code for a word. + def __init__(self, max_length=6, modified=False): + """Initialize AlphaSIS instance. 
Parameters ---------- - word : str - The word to transform max_length : int The maximum length (default 6) of the code to return modified : bool Indicates whether to use USDA modified NYSIIS + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + # Require a max_length of at least 6 + if self._max_length > -1: + self._max_length = max(6, self._max_length) + + self._modified = modified + + def encode(self, word): + """Return the NYSIIS code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -75,22 +97,25 @@ def encode(self, word, max_length=6, modified=False): >>> pe.encode('Schmidt') 'SNAD' - >>> pe.encode('Christopher', max_length=-1) + >>> NYSIIS(max_length=-1).encode('Christopher') 'CRASTAFAR' - >>> pe.encode('Christopher', max_length=8, modified=True) + >>> pe_8m = NYSIIS(max_length=8, modified=True) + >>> pe_8m.encode('Christopher') 'CRASTAFA' - >>> pe.encode('Niall', max_length=8, modified=True) + >>> pe_8m.encode('Niall') 'NAL' - >>> pe.encode('Smith', max_length=8, modified=True) + >>> pe_8m.encode('Smith') 'SNAT' - >>> pe.encode('Schmidt', max_length=8, modified=True) + >>> pe_8m.encode('Schmidt') 'SNAD' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ - # Require a max_length of at least 6 - if max_length > -1: - max_length = max(6, max_length) word = ''.join(c for c in word.upper() if c.isalpha()) word = word.replace('ß', 'SS') @@ -111,7 +136,7 @@ def encode(self, word, max_length=6, modified=False): word = 'FF' + word[2:] elif word[:3] == 'SCH': word = 'SSS' + word[3:] - elif modified: + elif self._modified: if word[:2] == 'WR': word = 'RR' + word[2:] elif word[:2] == 'RH': @@ -121,20 +146,20 @@ def encode(self, word, max_length=6, modified=False): elif word[:1] in self._uc_v_set: word = 'A' + word[1:] - if modified and word[-1:] in {'S', 'Z'}: + if self._modified and word[-1:] in {'S', 'Z'}: word = word[:-1] if ( word[-2:] == 'EE' or word[-2:] == 'IE' - or (modified and word[-2:] == 'YE') + or (self._modified and word[-2:] == 'YE') ): word = word[:-2] + 'Y' elif word[-2:] in {'DT', 'RT', 'RD'}: word = word[:-2] + 'D' elif word[-2:] in {'NT', 'ND'}: - word = word[:-2] + ('N' if modified else 'D') - elif modified: + word = word[:-2] + ('N' if self._modified else 'D') + elif self._modified: if word[-2:] == 'IX': word = word[:-2] + 'ICK' elif word[-2:] == 'EX': @@ -156,7 +181,7 @@ def encode(self, word, max_length=6, modified=False): skip = 1 elif word[i] in self._uc_v_set: word = word[:i] + 'A' + word[i + 1 :] - elif modified and i != len(word) - 1 and word[i] == 'Y': + elif self._modified and i != len(word) - 1 and word[i] == 'Y': word = word[:i] + 'A' + word[i + 1 :] elif word[i] == 'Q': word = word[:i] + 'G' + word[i + 1 :] @@ -168,13 +193,21 @@ def encode(self, word, max_length=6, modified=False): word = word[:i] + 'N' + word[i + 2 :] elif word[i] == 'K': word = word[:i] + 'C' + word[i + 1 :] - elif modified and i == len(word) - 3 and word[i : i + 3] == 'SCH': + elif ( + self._modified + and i == len(word) - 3 + and word[i : i + 3] == 'SCH' + ): word = word[:i] + 'SSA' skip = 2 elif word[i : i + 3] == 'SCH': word = word[:i] + 'SSS' + word[i + 3 :] skip = 2 - elif modified and i == len(word) - 2 and word[i : i + 2] == 'SH': + elif ( + self._modified + and i == len(word) - 2 + and word[i : i + 2] == 'SH' + ): word = word[:i] + 'SA' skip = 1 elif word[i : i + 2] == 'SH': @@ -183,13 +216,13 @@ def encode(self, word, max_length=6, modified=False): elif word[i : i + 2] == 'PH': 
word = word[:i] + 'FF' + word[i + 2 :] skip = 1 - elif modified and word[i : i + 3] == 'GHT': + elif self._modified and word[i : i + 3] == 'GHT': word = word[:i] + 'TTT' + word[i + 3 :] skip = 2 - elif modified and word[i : i + 2] == 'DG': + elif self._modified and word[i : i + 2] == 'DG': word = word[:i] + 'GG' + word[i + 2 :] skip = 1 - elif modified and word[i : i + 2] == 'WR': + elif self._modified and word[i : i + 2] == 'WR': word = word[:i] + 'RR' + word[i + 2 :] skip = 1 elif word[i] == 'H' and ( @@ -211,15 +244,21 @@ def encode(self, word, max_length=6, modified=False): key = key[:-2] + 'Y' if key[-1:] == 'A': key = key[:-1] - if modified and key[:1] == 'A': + if self._modified and key[:1] == 'A': key = original_first_char + key[1:] - if max_length > 0: - key = key[:max_length] + if self._max_length > 0: + key = key[: self._max_length] return key +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the NYSIIS.encode method instead.', +) def nysiis(word, max_length=6, modified=False): """Return the NYSIIS code for a word. @@ -262,8 +301,10 @@ def nysiis(word, max_length=6, modified=False): >>> nysiis('Schmidt', max_length=8, modified=True) 'SNAD' + .. versionadded:: 0.1.0 + """ - return NYSIIS().encode(word, max_length, modified) + return NYSIIS(max_length, modified).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_onca.py b/abydos/phonetic/_onca.py index 8a88949b0..34c1b6e44 100644 --- a/abydos/phonetic/_onca.py +++ b/abydos/phonetic/_onca.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from ._nysiis import NYSIIS from ._phonetic import _Phonetic from ._soundex import Soundex +from .. import __version__ __all__ = ['ONCA', 'onca'] @@ -44,24 +47,67 @@ class ONCA(_Phonetic): method" identified as the first step in this algorithm, so this is likely not a precisely correct implementation, in that it employs the standard NYSIIS algorithm. + + .. versionadded:: 0.3.6 """ - _nysiis = NYSIIS() - _soundex = Soundex() - - def encode(self, word, max_length=4, zero_pad=True): - """Return the Oxford Name Compression Algorithm (ONCA) code for a word. + def __init__(self, max_length=4, zero_pad=True): + """Initialize ONCA instance. Parameters ---------- - word : str - The word to transform max_length : int The maximum length (default 5) of the code to return zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + self._nysiis = NYSIIS(max_length=max_length * 3) + self._soundex = Soundex(max_length=max_length, zero_pad=zero_pad) + + def encode_alpha(self, word): + """Return the alphabetic ONCA code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic ONCA code + + Examples + -------- + >>> pe = ONCA() + >>> pe.encode_alpha('Christopher') + 'CRKT' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SNT' + + + .. versionadded:: 0.4.0 + + """ + return self._soundex.encode_alpha(self._nysiis.encode_alpha(word)) + + def encode(self, word): + """Return the Oxford Name Compression Algorithm (ONCA) code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -79,17 +125,24 @@ def encode(self, word, max_length=4, zero_pad=True): >>> pe.encode('Schmidt') 'S530' + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ # In the most extreme case, 3 characters of NYSIIS input can be # compressed to one character of output, so give it triple the # max_length. - return self._soundex.encode( - self._nysiis.encode(word, max_length=max_length * 3), - max_length, - zero_pad=zero_pad, - ) + return self._soundex.encode(self._nysiis.encode(word)) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the ONCA.encode method instead.', +) def onca(word, max_length=4, zero_pad=True): """Return the Oxford Name Compression Algorithm (ONCA) code for a word. @@ -120,8 +173,10 @@ def onca(word, max_length=4, zero_pad=True): >>> onca('Schmidt') 'S530' + .. versionadded:: 0.3.0 + """ - return ONCA().encode(word, max_length, zero_pad) + return ONCA(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_parmar_kumbharana.py b/abydos/phonetic/_parmar_kumbharana.py index a5f8f9884..c8790b0e7 100644 --- a/abydos/phonetic/_parmar_kumbharana.py +++ b/abydos/phonetic/_parmar_kumbharana.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['ParmarKumbharana', 'parmar_kumbharana'] @@ -39,6 +42,8 @@ class ParmarKumbharana(_Phonetic): """Parmar-Kumbharana code. This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`. + + .. versionadded:: 0.3.6 """ _rules = { @@ -88,6 +93,11 @@ def encode(self, word): >>> pe.encode('judge') 'JJ' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = word.upper() # Rule 3 word = self._delete_consecutive_repeats(word) # Rule 4 @@ -108,6 +118,12 @@ def encode(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the ParmarKumbharana.encode method instead.', +) def parmar_kumbharana(word): """Return the Parmar-Kumbharana encoding of a word. @@ -136,6 +152,8 @@ def parmar_kumbharana(word): >>> parmar_kumbharana('judge') 'JJ' + .. versionadded:: 0.3.0 + """ return ParmarKumbharana().encode(word) diff --git a/abydos/phonetic/_phonem.py b/abydos/phonetic/_phonem.py index fdb3c90f0..a7453dc5f 100644 --- a/abydos/phonetic/_phonem.py +++ b/abydos/phonetic/_phonem.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Phonem', 'phonem'] @@ -48,6 +51,8 @@ class Phonem(_Phonetic): :cite:`dcm4che:2011`. Phonem is intended chiefly for German names/words. + + .. versionadded:: 0.3.6 """ _substitutions = ( @@ -104,6 +109,11 @@ def encode(self, word): >>> pe.encode('Schmidt') 'CMYD' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFC', text_type(word.upper())) for i, j in self._substitutions: @@ -117,6 +127,12 @@ def encode(self, word): ) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Phonem.encode method instead.', +) def phonem(word): """Return the Phonem code for a word. @@ -143,6 +159,8 @@ def phonem(word): >>> phonem('Schmidt') 'CMYD' + .. 
versionadded:: 0.1.0 + """ return Phonem().encode(word) diff --git a/abydos/phonetic/_phonet.py b/abydos/phonetic/_phonet.py index 939e19dc8..eb7d55448 100644 --- a/abydos/phonetic/_phonet.py +++ b/abydos/phonetic/_phonet.py @@ -31,10 +31,13 @@ from collections import Counter from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Phonet', 'phonet'] @@ -50,6 +53,8 @@ class Phonet(_Phonetic): That is, in turn, based on Michael's C code, which is also licensed LGPL :cite:`Michael:2007`. + + .. versionadded:: 0.3.6 """ _rules_no_lang = ( # separator chars @@ -57,7 +62,7 @@ class Phonet(_Phonetic): '´', ' ', ' ', '"', ' ', ' ', '`$', '', '', - '\'', ' ', ' ', + "'", ' ', ' ', ',', ',', ',', ';', ',', ',', '-', ' ', ' ', @@ -104,9 +109,9 @@ class Phonet(_Phonetic): 'MC^', 'MAC', 'MAC', 'MC^', 'MAC', 'MAC', 'M´^', 'MAC', 'MAC', - 'M\'^', 'MAC', 'MAC', + "M'^", 'MAC', 'MAC', 'O´^', 'O', 'O', - 'O\'^', 'O', 'O', + "O'^", 'O', 'O', 'VAN DEN ^', 'VANDEN', 'VANDEN', None, None, None # fmt: on @@ -117,7 +122,7 @@ class Phonet(_Phonetic): '´', ' ', ' ', '"', ' ', ' ', '`$', '', '', - '\'', ' ', ' ', + "'", ' ', ' ', ',', ' ', ' ', ';', ' ', ' ', '-', ' ', ' ', @@ -299,7 +304,7 @@ class Phonet(_Phonetic): 'CERST(EI)----^', 'KE', 'KE', 'CER$', 'ZA', 'ZA', 'CE3', 'ZE', 'ZE', - 'CH\'S$', 'X', 'X', + "CH'S$", 'X', 'X', 'CH´S$', 'X', 'X', 'CHAO(ST)-', 'KAO', 'KAU', 'CHAMPIO-^', 'SHEMPI', 'ZENBI', @@ -345,14 +350,14 @@ class Phonet(_Phonetic): 'CST', 'XT', 'XT', 'CS<^', 'Z', 'Z', 'C(SßX)', 'X', 'X', - 'CT\'S$', 'X', 'X', + "CT'S$", 'X', 'X', 'CT(SßXZ)', 'X', 'X', 'CZ<', 'Z', 'Z', 'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', 'C.^', 'C.', 'C.', 'CÄ-', 'Z', 'Z', 'CÜ$', 'ZÜ', 'ZI', - 'C\'S$', 'X', 'X', + "C'S$", 'X', 'X', 'C<', 'K', 'K', 'DAHER^$', 'DAHER', None, 'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', @@ -382,10 +387,10 @@ class Phonet(_Phonetic): 'D(SßZ)', 'Z', 'Z', 'D(AÄEIOÖRUÜY)-', 'D', None, 'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, - 'D\'H^', 'D', 'T', + "D'H^", 'D', 'T', 'D´H^', 'D', 'T', 'D`H^', 'D', 'T', - 'D\'S3$', 'Z', 'Z', + "D'S3$", 'Z', 'Z', 'D´S3$', 'Z', 'Z', 'D^', 'D', None, 'D', 'T', 'T', @@ -519,7 +524,7 @@ class Phonet(_Phonetic): 'GY9^', 'GÜ', None, 'G(AÄEILOÖRUÜY)-', 'G', None, 'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, - 'G\'S$', 'X', 'X', + "G'S$", 'X', 'X', 'G´S$', 'X', 'X', 'G^', 'G', None, 'G', 'K', 'K', @@ -619,18 +624,18 @@ class Phonet(_Phonetic): 'KSCH---', 'K', 'K', 'KSH--', 'K', 'K', 'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' - 'KT\'S$', 'X', 'X', + "KT'S$", 'X', 'X', 'KTI(AIOU)-3', 'XI', 'XI', 'KT(SßXZ)', 'X', 'X', 'KY9^', 'KÜ', None, - 'K\'S$', 'X', 'X', + "K'S$", 'X', 'X', 'K´S$', 'X', 'X', 'LANGES$', ' LANGES', ' LANKEZ', 'LANGE$', ' LANGE', ' LANKE', 'LANG$', ' LANK', ' LANK', 'LARVE-', 'LARF', 'LARF', 'LD(SßZ)$', 'LS', 'LZ', - 'LD\'S$', 'LS', 'LZ', + "LD'S$", 'LS', 'LZ', 'LD´S$', 'LS', 'LZ', 'LEAND-^', 'LEAN', 'LEAN', 'LEERSTEHE-----^', 'LER ', 'LER ', @@ -647,7 +652,7 @@ class Phonet(_Phonetic): 'LIC$', 'LIZ', 'LIZ', 'LIVE^$', 'LEIF', 'LEIF', 'LT(SßZ)$', 'LS', 'LZ', - 'LT\'S$', 'LS', 'LZ', + "LT'S$", 'LS', 'LZ', 'LT´S$', 'LS', 'LZ', 'LUI(GS)--', 'LU', 'LU', 'LV(AIO)-', 'LW', None, @@ -683,9 +688,9 @@ class Phonet(_Phonetic): 'MY9^', 'MÜ', None, 'M(ßZ)$', 'MS', None, 'M´G7^', 'MAK', 'NAK', - 'M\'G7^', 'MAK', 'NAK', + "M'G7^", 'MAK', 'NAK', 'M´^', 'MAK', 'NAK', - 'M\'^', 'MAK', 'NAK', + "M'^", 'MAK', 'NAK', 'M', None, 'N', 'NACH^^', 'NACH', 'NAK', 
'NADINE', 'NADIN', 'NATIN', @@ -701,7 +706,7 @@ class Phonet(_Phonetic): 'NDRO(CDKTZ)-', 'NTRO', None, 'ND(BFGJLMNPQVW)-', 'NT', None, 'ND(SßZ)$', 'NS', 'NZ', - 'ND\'S$', 'NS', 'NZ', + "ND'S$", 'NS', 'NZ', 'ND´S$', 'NS', 'NZ', 'NEBEN^^', 'NEBN', 'NEBN', 'NENGELERN------', 'NEN ', 'NEN ', @@ -724,7 +729,7 @@ class Phonet(_Phonetic): 'NTI(AIOU)-3', 'NZI', 'NZI', 'NTIEL--3', 'NZI', 'NZI', 'NT(SßZ)$', 'NS', 'NZ', - 'NT\'S$', 'NS', 'NZ', + "NT'S$", 'NS', 'NZ', 'NT´S$', 'NS', 'NZ', 'NYLON', 'NEILON', 'NEILUN', 'NY9^', 'NÜ', None, @@ -770,7 +775,7 @@ class Phonet(_Phonetic): 'O(JY)<', 'EU', 'EU', 'OZ$', 'OS', None, 'O´^', 'O', 'U', - 'O\'^', 'O', 'U', + "O'^", 'O', 'U', 'O', None, 'U', 'PATIEN--^', 'PAZI', 'PAZI', 'PENSIO-^', 'PANSI', 'PANZI', @@ -877,7 +882,7 @@ class Phonet(_Phonetic): 'STEPHEN-^$', 'STEW', None, 'STERN', 'STERN', None, 'STRAF^^', 'STRAF', 'ZTRAF', - 'ST\'S$', 'Z', 'Z', + "ST'S$", 'Z', 'Z', 'ST´S$', 'Z', 'Z', 'STST--', '', '', 'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', @@ -932,7 +937,7 @@ class Phonet(_Phonetic): 'TX(AEIOU)-3', 'SH', 'Z', 'TY9^', 'TÜ', None, 'TZ-', '', '', - 'T\'S3$', 'Z', 'Z', + "T'S3$", 'Z', 'Z', 'T´S3$', 'Z', 'Z', 'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', 'UEBER^^', 'ÜBA', 'IBA', @@ -1072,18 +1077,31 @@ class Phonet(_Phonetic): ) ) - def encode(self, word, mode=1, lang='de'): - """Return the phonet code for a word. + def __init__(self, mode=1, lang='de'): + """Initialize AlphaSIS instance. Parameters ---------- - word : str - The word to transform mode : int The ponet variant to employ (1 or 2) lang : str ``de`` (default) for German, ``none`` for no language + + .. versionadded:: 0.4.0 + + """ + self._mode = mode + self._lang = lang + + def encode(self, word): + """Return the phonet code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -1101,24 +1119,31 @@ def encode(self, word, mode=1, lang='de'): >>> pe.encode('Schmidt') 'SHMIT' - >>> pe.encode('Christopher', mode=2) + >>> pe2 = Phonet(mode=2) + >>> pe2.encode('Christopher') 'KRIZTUFA' - >>> pe.encode('Niall', mode=2) + >>> pe2.encode('Niall') 'NIAL' - >>> pe.encode('Smith', mode=2) + >>> pe2.encode('Smith') 'ZNIT' - >>> pe.encode('Schmidt', mode=2) + >>> pe2.encode('Schmidt') 'ZNIT' - >>> pe.encode('Christopher', lang='none') + >>> pe_none = Phonet(lang='none') + >>> pe_none.encode('Christopher') 'CHRISTOPHER' - >>> pe.encode('Niall', lang='none') + >>> pe_none.encode('Niall') 'NIAL' - >>> pe.encode('Smith', lang='none') + >>> pe_none.encode('Smith') 'SMITH' - >>> pe.encode('Schmidt', lang='none') + >>> pe_none.encode('Schmidt') 'SCHMIDT' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ phonet_hash = Counter() alpha_pos = Counter() @@ -1134,6 +1159,8 @@ def _initialize_phonet(lang): lang : str Language to use for rules + .. versionadded:: 0.1.0 + """ if lang == 'none': _phonet_rules = self._rules_no_lang @@ -1259,6 +1286,8 @@ def _phonet(term, mode, lang): str The phonet value + .. 
versionadded:: 0.1.0 + """ if lang == 'none': _phonet_rules = self._rules_no_lang @@ -1716,12 +1745,18 @@ def _phonet(term, mode, lang): return dest - _initialize_phonet(lang) + _initialize_phonet(self._lang) word = unicode_normalize('NFKC', text_type(word)) - return _phonet(word, mode, lang) + return _phonet(word, self._mode, self._lang) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Phonet.encode method instead.', +) def phonet(word, mode=1, lang='de'): """Return the phonet code for a word. @@ -1770,8 +1805,10 @@ def phonet(word, mode=1, lang='de'): >>> phonet('Schmidt', lang='none') 'SCHMIDT' + .. versionadded:: 0.1.0 + """ - return Phonet().encode(word, mode, lang) + return Phonet(mode, lang).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_phonetic.py b/abydos/phonetic/_phonetic.py index 26c65a10d..0fa1aef6b 100644 --- a/abydos/phonetic/_phonetic.py +++ b/abydos/phonetic/_phonetic.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -30,9 +30,14 @@ from itertools import groupby +__all__ = ['_Phonetic'] + class _Phonetic(object): - """Abstract Phonetic class.""" + """Abstract Phonetic class. + + .. versionadded:: 0.3.6 + """ _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ') _lc_set = set('abcdefghijklmnopqrstuvwxyz') @@ -65,6 +70,11 @@ def _delete_consecutive_repeats(self, word): >>> pe._delete_consecutive_repeats('AAACCCTTTGGG') 'ACTG' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return ''.join(char for char, _ in groupby(word)) @@ -76,6 +86,9 @@ def encode(self, word): word : str The word to transform + + .. versionadded:: 0.3.6 + """ pass @@ -92,6 +105,9 @@ def encode_alpha(self, word): str The word transformed + + .. versionadded:: 0.3.6 + """ return self.encode(word) diff --git a/abydos/phonetic/_phonetic_spanish.py b/abydos/phonetic/_phonetic_spanish.py index 4a86bf1ce..c4d82e4f2 100644 --- a/abydos/phonetic/_phonetic_spanish.py +++ b/abydos/phonetic/_phonetic_spanish.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['PhoneticSpanish', 'phonetic_spanish'] @@ -42,6 +45,8 @@ class PhoneticSpanish(_Phonetic): This follows the coding described in :cite:`Amon:2012` and :cite:`delPilarAngeles:2015`. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -50,15 +55,62 @@ class PhoneticSpanish(_Phonetic): _uc_set = set('BCDFGHJKLMNPQRSTVXYZ') - def encode(self, word, max_length=-1): + _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'PBFTSLNKGR')) + + def __init__(self, max_length=-1): + """Initialize PhoneticSpanish instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to unlimited) + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode_alpha(self, word): + """Return the alphabetic PhoneticSpanish coding of word. 
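
# The functional wrappers such as phonet() are kept for now, but the @deprecated
# decorators added throughout this patch make them warn on use until their
# removal in 0.6.0.  A rough sketch of surfacing that warning; the warning class
# comes from the third-party `deprecation` package, which issues a subclass of
# DeprecationWarning.
import warnings

from abydos.phonetic import phonet

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    code = phonet('Schmidt', lang='none')  # 'SCHMIDT', per the doctest above
caught_categories = {w.category.__name__ for w in caught}  # expect a DeprecationWarning subclass
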
+ + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic PhoneticSpanish code + + Examples + -------- + >>> pe = PhoneticSpanish() + >>> pe.encode_alpha('Perez') + 'PRS' + >>> pe.encode_alpha('Martinez') + 'NRTNS' + >>> pe.encode_alpha('Gutierrez') + 'GTRRS' + >>> pe.encode_alpha('Santiago') + 'SNTG' + >>> pe.encode_alpha('Nicolás') + 'NSLS' + + + .. versionadded:: 0.4.0 + + """ + return self.encode(word).translate(self._alphabetic) + + def encode(self, word): """Return the PhoneticSpanish coding of word. Parameters ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to unlimited) Returns ------- @@ -79,6 +131,11 @@ def encode(self, word, max_length=-1): >>> pe.encode('Nicolás') '6454' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, and decompose, filter to A-Z minus vowels & W word = unicode_normalize('NFKD', text_type(word.upper())) @@ -91,12 +148,18 @@ def encode(self, word, max_length=-1): # apply the Soundex algorithm sdx = word.translate(self._trans) - if max_length > 0: - sdx = (sdx + ('0' * max_length))[:max_length] + if self._max_length > 0: + sdx = (sdx + ('0' * self._max_length))[: self._max_length] return sdx +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the PhoneticSpanish.encode method instead.', +) def phonetic_spanish(word, max_length=-1): """Return the PhoneticSpanish coding of word. @@ -127,8 +190,10 @@ def phonetic_spanish(word, max_length=-1): >>> phonetic_spanish('Nicolás') '6454' + .. versionadded:: 0.3.0 + """ - return PhoneticSpanish().encode(word, max_length) + return PhoneticSpanish(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_phonex.py b/abydos/phonetic/_phonex.py index 567dd34b7..155839358 100644 --- a/abydos/phonetic/_phonex.py +++ b/abydos/phonetic/_phonex.py @@ -30,10 +30,13 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Phonex', 'phonex'] @@ -42,21 +45,74 @@ class Phonex(_Phonetic): """Phonex code. Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=4, zero_pad=True): - """Return the Phonex code for a word. + _alphabetic = dict(zip((ord(_) for _ in '123456'), 'PSTLNR')) + + def __init__(self, max_length=4, zero_pad=True): + """Initialize Phonex instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + # Clamp max_length to [4, 64] + if max_length != -1: + self._max_length = min(max(4, max_length), 64) + else: + self._max_length = 64 + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Phonex code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Phonex value + + Examples + -------- + >>> pe = Phonex() + >>> pe.encode_alpha('Christopher') + 'CRST' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SSNT' + + + .. 
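
# The Phonex constructor above clamps max_length into [4, 64], with -1 standing
# for "unlimited" (stored as 64); the same clamp recurs in Phonix and Soundex
# below.  A standalone restatement (the helper name _clamp is illustrative only):
def _clamp(max_length):
    return min(max(4, max_length), 64) if max_length != -1 else 64

_clamp(2)    # 4  -- too short, raised to the minimum
_clamp(-1)   # 64 -- "unlimited"
_clamp(100)  # 64 -- capped at the maximum
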
versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the Phonex code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -74,16 +130,15 @@ def encode(self, word, max_length=4, zero_pad=True): >>> pe.encode('Smith') 'S530' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ name = unicode_normalize('NFKD', text_type(word.upper())) name = name.replace('ß', 'SS') - # Clamp max_length to [4, 64] - if max_length != -1: - max_length = min(max(4, max_length), 64) - else: - max_length = 64 - name_code = last = '' # Deletions effected by replacing with next letter which @@ -157,13 +212,19 @@ def encode(self, word, max_length=4, zero_pad=True): last = name_code[-1] - if zero_pad: - name_code += '0' * max_length + if self._zero_pad: + name_code += '0' * self._max_length if not name_code: name_code = '0' - return name_code[:max_length] + return name_code[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Phonex.encode method instead.', +) def phonex(word, max_length=4, zero_pad=True): """Return the Phonex code for a word. @@ -194,8 +255,10 @@ def phonex(word, max_length=4, zero_pad=True): >>> phonex('Smith') 'S530' + .. versionadded:: 0.1.0 + """ - return Phonex().encode(word, max_length, zero_pad) + return Phonex(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_phonix.py b/abydos/phonetic/_phonix.py index f2fd2eb19..03b852a83 100644 --- a/abydos/phonetic/_phonix.py +++ b/abydos/phonetic/_phonix.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Phonix', 'phonix'] @@ -46,6 +49,8 @@ class Phonix(_Phonetic): - :cite:`Pfeifer:2000` - :cite:`Christen:2011` - :cite:`Kollar:2007` + + .. versionadded:: 0.3.6 """ _uc_c_set = None @@ -59,8 +64,23 @@ class Phonix(_Phonetic): ) ) - def __init__(self): - """Initialize Phonix.""" + _alphabetic = dict(zip((ord(_) for _ in '012345678'), 'APKTLNRFS')) + + def __init__(self, max_length=4, zero_pad=True): + """Initialize Phonix instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to 4) + zero_pad : bool + Pad the end of the return value with 0s to achieve a max_length + string + + + .. versionadded:: 0.3.6 + + """ self._uc_c_set = ( super(Phonix, self)._uc_set - super(Phonix, self)._uc_v_set ) @@ -179,18 +199,53 @@ def __init__(self): (3, 'MPT', 'MT'), ) - def encode(self, word, max_length=4, zero_pad=True): + # Clamp max_length to [4, 64] + if max_length != -1: + self._max_length = min(max(4, max_length), 64) + else: + self._max_length = 64 + + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Phonix code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Phonix value + + Examples + -------- + >>> pe = Phonix() + >>> pe.encode_alpha('Christopher') + 'KRST' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SNT' + + + .. 
versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): """Return the Phonix code for a word. Parameters ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to 4) - zero_pad : bool - Pad the end of the return value with 0s to achieve a max_length - string Returns ------- @@ -209,6 +264,11 @@ def encode(self, word, max_length=4, zero_pad=True): >>> pe.encode('Schmidt') 'S530' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _start_repl(word, src, tar, post=None): @@ -230,6 +290,8 @@ def _start_repl(word, src, tar, post=None): str Modified string + .. versionadded:: 0.1.0 + """ if post: for i in post: @@ -258,6 +320,8 @@ def _end_repl(word, src, tar, pre=None): str Modified string + .. versionadded:: 0.1.0 + """ if pre: for i in pre: @@ -288,6 +352,8 @@ def _mid_repl(word, src, tar, pre=None, post=None): str Modified string + .. versionadded:: 0.1.0 + """ if pre or post: if not pre: @@ -320,6 +386,8 @@ def _all_repl(word, src, tar, pre=None, post=None): str Modified string + .. versionadded:: 0.1.0 + """ if pre or post: if post: @@ -354,19 +422,19 @@ def _all_repl(word, src, tar, pre=None, post=None): sdx = self._delete_consecutive_repeats(sdx) sdx = sdx.replace('0', '') - # Clamp max_length to [4, 64] - if max_length != -1: - max_length = min(max(4, max_length), 64) - else: - max_length = 64 - - if zero_pad: - sdx += '0' * max_length + if self._zero_pad: + sdx += '0' * self._max_length if not sdx: sdx = '0' - return sdx[:max_length] + return sdx[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Phonix.encode method instead.', +) def phonix(word, max_length=4, zero_pad=True): """Return the Phonix code for a word. @@ -397,8 +465,10 @@ def phonix(word, max_length=4, zero_pad=True): >>> phonix('Schmidt') 'S530' + .. versionadded:: 0.1.0 + """ - return Phonix().encode(word, max_length, zero_pad) + return Phonix(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_pshp_soundex_first.py b/abydos/phonetic/_pshp_soundex_first.py index 67d978bf8..707e5cd72 100644 --- a/abydos/phonetic/_pshp_soundex_first.py +++ b/abydos/phonetic/_pshp_soundex_first.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['PSHPSoundexFirst', 'pshp_soundex_first'] @@ -46,6 +49,8 @@ class PSHPSoundexFirst(_Phonetic): :cite:`Hershberg:1979`. A separate class, :py:class:`PSHPSoundexLast` is used for last names. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -55,18 +60,81 @@ class PSHPSoundexFirst(_Phonetic): ) ) - def encode(self, fname, max_length=4, german=False): - """Calculate the PSHP Soundex/Viewex Coding of a first name. + _alphabetic = dict(zip((ord(_) for _ in '12345'), 'PKTLN')) + + def __init__(self, max_length=4, german=False): + """Initialize PSHPSoundexFirst instance. Parameters ---------- - fname : str - The first name to encode max_length : int The length of the code returned (defaults to 4) german : bool Set to True if the name is German (different rules apply) + + .. 
versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._german = german + + def encode_alpha(self, fname): + """Calculate the alphabetic PSHP Soundex/Viewex Coding of a first name. + + Parameters + ---------- + fname : str + The first name to encode + + Returns + ------- + str + The alphabetic PSHP Soundex/Viewex Coding + + Examples + -------- + >>> pe = PSHPSoundexFirst() + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Waters') + 'WTNK' + >>> pe.encode_alpha('James') + 'JN' + >>> pe.encode_alpha('Schmidt') + 'SN' + >>> pe.encode_alpha('Ashcroft') + 'AKK' + >>> pe.encode_alpha('John') + 'JN' + >>> pe.encode_alpha('Colin') + 'KL' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Sally') + 'SL' + >>> pe.encode_alpha('Jane') + 'JN' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(fname).rstrip('0') + if code == 'J7': + return 'JN' + elif code == 'P7': + return 'PT' + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, fname): + """Calculate the PSHP Soundex/Viewex Coding of a first name. + + Parameters + ---------- + fname : str + The first name to encode + Returns ------- str @@ -96,6 +164,11 @@ def encode(self, fname, max_length=4, german=False): >>> pe.encode('Jane') 'J500' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ fname = unicode_normalize('NFKD', text_type(fname.upper())) fname = fname.replace('ß', 'SS') @@ -125,7 +198,7 @@ def encode(self, fname, max_length=4, german=False): elif fname[:3] in {'WIE', 'WEI'}: fname = 'V' + fname[1:] - if german and fname[:1] in {'W', 'M', 'Y', 'Z'}: + if self._german and fname[:1] in {'W', 'M', 'Y', 'Z'}: fname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[ fname[0] ] + fname[1:] @@ -145,15 +218,21 @@ def encode(self, fname, max_length=4, german=False): code = code.replace('0', '') # rule 1 - if max_length != -1: - if len(code) < max_length: - code += '0' * (max_length - len(code)) + if self._max_length != -1: + if len(code) < self._max_length: + code += '0' * (self._max_length - len(code)) else: - code = code[:max_length] + code = code[: self._max_length] return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the PSHPSoundexFirst.encode method instead.', +) def pshp_soundex_first(fname, max_length=4, german=False): """Calculate the PSHP Soundex/Viewex Coding of a first name. @@ -196,8 +275,10 @@ def pshp_soundex_first(fname, max_length=4, german=False): >>> pshp_soundex_first('Jane') 'J500' + .. versionadded:: 0.3.0 + """ - return PSHPSoundexFirst().encode(fname, max_length, german) + return PSHPSoundexFirst(max_length, german).encode(fname) if __name__ == '__main__': diff --git a/abydos/phonetic/_pshp_soundex_last.py b/abydos/phonetic/_pshp_soundex_last.py index 0841e1ca1..a61a529c9 100644 --- a/abydos/phonetic/_pshp_soundex_last.py +++ b/abydos/phonetic/_pshp_soundex_last.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['PSHPSoundexLast', 'pshp_soundex_last'] @@ -46,6 +49,8 @@ class PSHPSoundexLast(_Phonetic): :cite:`Hershberg:1979`. A separate function, :py:class:`PSHPSoundexFirst` is used for first names. + + .. 
versionadded:: 0.3.6 """ _trans = dict( @@ -55,18 +60,67 @@ class PSHPSoundexLast(_Phonetic): ) ) - def encode(self, lname, max_length=4, german=False): - """Calculate the PSHP Soundex/Viewex Coding of a last name. + _alphabetic = dict(zip((ord(_) for _ in '12345'), 'PKTLN')) + + def __init__(self, max_length=4, german=False): + """Initialize PSHPSoundexLast instance. Parameters ---------- - lname : str - The last name to encode max_length : int The length of the code returned (defaults to 4) german : bool Set to True if the name is German (different rules apply) + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._german = german + + def encode_alpha(self, lname): + """Calculate the alphabetic PSHP Soundex/Viewex Coding of a last name. + + Parameters + ---------- + lname : str + The last name to encode + + Returns + ------- + str + The PSHP alphabetic Soundex/Viewex Coding + + Examples + -------- + >>> pe = PSHPSoundexLast() + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Waters') + 'WTN' + >>> pe.encode_alpha('James') + 'JN' + >>> pe.encode_alpha('Schmidt') + 'SNT' + >>> pe.encode_alpha('Ashcroft') + 'AKKN' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(lname).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, lname): + """Calculate the PSHP Soundex/Viewex Coding of a last name. + + Parameters + ---------- + lname : str + The last name to encode + Returns ------- str @@ -86,6 +140,11 @@ def encode(self, lname, max_length=4, german=False): >>> pe.encode('Ashcroft') 'A225' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ lname = unicode_normalize('NFKD', text_type(lname.upper())) lname = lname.replace('ß', 'SS') @@ -100,7 +159,7 @@ def encode(self, lname, max_length=4, german=False): # 1 indicates "except in German data"). It doesn't make sense for them # to become 1 (BPFV -> 1) or to apply outside German. Unfortunately, # both articles have this error(?). - if not german: + if not self._german: if lname[:3] == 'MAC': lname = 'M' + lname[3:] elif lname[:2] == 'MC': @@ -126,7 +185,7 @@ def encode(self, lname, max_length=4, german=False): elif lname[:3] in {'WIE', 'WEI'}: lname = 'V' + lname[1:] - if german and lname[:1] in {'W', 'M', 'Y', 'Z'}: + if self._german and lname[:1] in {'W', 'M', 'Y', 'Z'}: lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]] + lname[ 1: ] @@ -134,7 +193,7 @@ def encode(self, lname, max_length=4, german=False): code = lname[:1] # B. Postfix treatment - if german: # moved from end of postfix treatment due to blocking + if self._german: # moved from end of postfix treatment due to blocking if lname[-3:] == 'TES': lname = lname[:-3] elif lname[-2:] == 'TS': @@ -157,7 +216,7 @@ def encode(self, lname, max_length=4, german=False): elif lname[-1:] == 'S': lname = lname[:-1] - if not german: + if not self._german: l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'} l4_repl = { 'NSEN': 'ASEN', @@ -172,7 +231,7 @@ def encode(self, lname, max_length=4, german=False): if lname[-2:] in {'NG', 'ND'}: lname = lname[:-1] - if not german and lname[-3:] in {'GAN', 'GEN'}: + if not self._german and lname[-3:] in {'GAN', 'GEN'}: lname = lname[:-3] + 'A' + lname[-2:] # C. 
Infix Treatment @@ -195,15 +254,21 @@ def encode(self, lname, max_length=4, german=False): code += lname[1:] code = code.replace('0', '') # rule 1 - if max_length != -1: - if len(code) < max_length: - code += '0' * (max_length - len(code)) + if self._max_length != -1: + if len(code) < self._max_length: + code += '0' * (self._max_length - len(code)) else: - code = code[:max_length] + code = code[: self._max_length] return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the PSHPSoundexLast.encode method instead.', +) def pshp_soundex_last(lname, max_length=4, german=False): """Calculate the PSHP Soundex/Viewex Coding of a last name. @@ -236,8 +301,10 @@ def pshp_soundex_last(lname, max_length=4, german=False): >>> pshp_soundex_last('Ashcroft') 'A225' + .. versionadded:: 0.3.0 + """ - return PSHPSoundexLast().encode(lname, max_length, german) + return PSHPSoundexLast(max_length, german).encode(lname) if __name__ == '__main__': diff --git a/abydos/phonetic/_refined_soundex.py b/abydos/phonetic/_refined_soundex.py index c6e0447e1..97c89b7c8 100644 --- a/abydos/phonetic/_refined_soundex.py +++ b/abydos/phonetic/_refined_soundex.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['RefinedSoundex', 'refined_soundex'] @@ -42,6 +45,8 @@ class RefinedSoundex(_Phonetic): This is Soundex, but with more character classes. It was defined at :cite:`Boyce:1998`. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -51,13 +56,13 @@ class RefinedSoundex(_Phonetic): ) ) - def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False): - """Return the Refined Soundex code for a word. + _alphabetic = dict(zip((ord(_) for _ in '123456789'), 'PFKGZTLNR')) + + def __init__(self, max_length=-1, zero_pad=False, retain_vowels=False): + """Initialize RefinedSoundex instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to unlimited) zero_pad : bool @@ -66,6 +71,54 @@ def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False): retain_vowels : bool Retain vowels (as 0) in the resulting code + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._zero_pad = zero_pad + self._retain_vowels = retain_vowels + + def encode_alpha(self, word): + """Return the alphabetic Refined Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Refined Soundex value + + Examples + -------- + >>> pe = RefinedSoundex() + >>> pe.encode_alpha('Christopher') + 'CRKTPR' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SKNT' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the Refined Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -75,14 +128,19 @@ def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False): -------- >>> pe = RefinedSoundex() >>> pe.encode('Christopher') - 'C393619' + 'C93619' >>> pe.encode('Niall') - 'N87' + 'N7' >>> pe.encode('Smith') - 'S386' + 'S86' >>> pe.encode('Schmidt') 'S386' + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) @@ -90,19 +148,25 @@ def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False): word = ''.join(c for c in word if c in self._uc_set) # apply the Soundex algorithm - sdx = word[:1] + word.translate(self._trans) + sdx = word[:1] + word[1:].translate(self._trans) sdx = self._delete_consecutive_repeats(sdx) - if not retain_vowels: + if not self._retain_vowels: sdx = sdx.replace('0', '') # Delete vowels, H, W, Y - if max_length > 0: - if zero_pad: - sdx += '0' * max_length - sdx = sdx[:max_length] + if self._max_length > 0: + if self._zero_pad: + sdx += '0' * self._max_length + sdx = sdx[: self._max_length] return sdx +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RefinedSoundex.encode method instead.', +) def refined_soundex(word, max_length=-1, zero_pad=False, retain_vowels=False): """Return the Refined Soundex code for a word. @@ -127,16 +191,18 @@ def refined_soundex(word, max_length=-1, zero_pad=False, retain_vowels=False): Examples -------- >>> refined_soundex('Christopher') - 'C393619' + 'C93619' >>> refined_soundex('Niall') - 'N87' + 'N7' >>> refined_soundex('Smith') - 'S386' + 'S86' >>> refined_soundex('Schmidt') 'S386' + .. versionadded:: 0.3.0 + """ - return RefinedSoundex().encode(word, max_length, zero_pad, retain_vowels) + return RefinedSoundex(max_length, zero_pad, retain_vowels).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_reth_schek.py b/abydos/phonetic/_reth_schek.py index 92e56aa94..9753ab95c 100644 --- a/abydos/phonetic/_reth_schek.py +++ b/abydos/phonetic/_reth_schek.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['RethSchek', 'reth_schek_phonetik'] @@ -55,6 +58,8 @@ class RethSchek(_Phonetic): - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't think of a German word with '-tui-' in it.) - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'? + + .. versionadded:: 0.3.6 """ _replacements = { @@ -139,17 +144,23 @@ def encode(self, word): Examples -------- - >>> reth_schek_phonetik('Joachim') + >>> pe = RethSchek() + >>> pe.encode('Joachim') 'JOAGHIM' - >>> reth_schek_phonetik('Christoph') + >>> pe.encode('Christoph') 'GHRISDOF' - >>> reth_schek_phonetik('Jörg') + >>> pe.encode('Jörg') 'JOERG' - >>> reth_schek_phonetik('Smith') + >>> pe.encode('Smith') 'SMID' - >>> reth_schek_phonetik('Schmidt') + >>> pe.encode('Schmidt') 'SCHMID' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # Uppercase word = word.upper() @@ -189,6 +200,12 @@ def encode(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RethSchek.encode method instead.', +) def reth_schek_phonetik(word): """Return Reth-Schek Phonetik code for a word. @@ -217,6 +234,8 @@ def reth_schek_phonetik(word): >>> reth_schek_phonetik('Schmidt') 'SCHMID' + .. 
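
# The RefinedSoundex change above (translating word[1:] rather than the whole
# word) is what shifts the documented codes, e.g. 'C393619' -> 'C93619': the
# leading letter is still kept verbatim but no longer also contributes its own
# digit.  A sketch using the corrected doctest values:
from abydos.phonetic import RefinedSoundex

pe = RefinedSoundex()
pe.encode('Christopher')  # 'C93619'
pe.encode('Smith')        # 'S86'
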
versionadded:: 0.3.0 + """ return RethSchek().encode(word) diff --git a/abydos/phonetic/_roger_root.py b/abydos/phonetic/_roger_root.py index b8f375b24..488f2da12 100644 --- a/abydos/phonetic/_roger_root.py +++ b/abydos/phonetic/_roger_root.py @@ -30,10 +30,13 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['RogerRoot', 'roger_root'] @@ -42,6 +45,8 @@ class RogerRoot(_Phonetic): """Roger Root code. This is Roger Root name coding, described in :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ # '*' is used to prevent combining by _delete_consecutive_repeats() @@ -138,19 +143,69 @@ class RogerRoot(_Phonetic): }, } - def encode(self, word, max_length=5, zero_pad=True): - """Return the Roger Root code for a word. + _alphabetic_initial = dict(zip((ord(_) for _ in '012345'), ' AHJWY')) + _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'STNMRLJKFP')) + + def __init__(self, max_length=5, zero_pad=True): + """Initialize RogerRoot instance. Parameters ---------- - word : str - The word to transform max_length : int The maximum length (default 5) of the code to return zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Roger Root code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Roger Root code + + Examples + -------- + >>> pe = RogerRoot() + >>> pe.encode_alpha('Christopher') + 'JRST' + >>> pe.encode_alpha('Niall') + 'NL' + >>> pe.encode_alpha('Smith') + 'SMT' + >>> pe.encode_alpha('Schmidt') + 'JMT' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1].translate(self._alphabetic_initial).strip() + code[ + 1: + ].translate(self._alphabetic) + + def encode(self, word): + """Return the Roger Root code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -158,15 +213,21 @@ def encode(self, word, max_length=5, zero_pad=True): Examples -------- - >>> roger_root('Christopher') + >>> pe = RogerRoot() + >>> pe.encode('Christopher') '06401' - >>> roger_root('Niall') + >>> pe.encode('Niall') '02500' - >>> roger_root('Smith') + >>> pe.encode('Smith') '00310' - >>> roger_root('Schmidt') + >>> pe.encode('Schmidt') '06310' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) @@ -194,12 +255,18 @@ def encode(self, word, max_length=5, zero_pad=True): code = self._delete_consecutive_repeats(code) code = code.replace('*', '') - if zero_pad: - code += '0' * max_length + if self._zero_pad: + code += '0' * self._max_length - return code[:max_length] + return code[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RogerRoot.encode method instead.', +) def roger_root(word, max_length=5, zero_pad=True): """Return the Roger Root code for a word. @@ -230,8 +297,10 @@ def roger_root(word, max_length=5, zero_pad=True): >>> roger_root('Schmidt') '06310' + .. 
versionadded:: 0.3.0 + """ - return RogerRoot().encode(word, max_length, zero_pad) + return RogerRoot(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_russell_index.py b/abydos/phonetic/_russell_index.py index 0964224f3..73be28753 100644 --- a/abydos/phonetic/_russell_index.py +++ b/abydos/phonetic/_russell_index.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = [ 'RussellIndex', @@ -47,6 +50,9 @@ class RussellIndex(_Phonetic): This follows Robert C. Russell's Index algorithm, as described in :cite:`Russell:1917`. + + + .. versionadded:: 0.3.6 """ _uc_set = set('ABCDEFGIKLMNOPQRSTUVXYZ') @@ -86,6 +92,11 @@ def encode(self, word): >>> pe.encode('Schmidt') 3614 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') @@ -133,6 +144,11 @@ def _to_alpha(self, num): >>> pe._to_alpha(3614) 'CMAD' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ num = ''.join(c for c in text_type(num) if c in self._num_set) if num: @@ -167,12 +183,23 @@ def encode_alpha(self, word): >>> pe.encode_alpha('Schmidt') 'CMAD' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if word: return self._to_alpha(self.encode(word)) return '' +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RussellIndex.encode method instead.', +) def russell_index(word): """Return the Russell Index (integer output) of a word. @@ -199,10 +226,19 @@ def russell_index(word): >>> russell_index('Schmidt') 3614 + + .. versionadded:: 0.1.0 + """ return RussellIndex().encode(word) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RussellIndex._to_alpha method instead.', +) def russell_index_num_to_alpha(num): """Convert the Russell Index integer to an alphabetic string. @@ -227,10 +263,19 @@ def russell_index_num_to_alpha(num): >>> russell_index_num_to_alpha(3614) 'CMAD' + + .. versionadded:: 0.1.0 + """ return RussellIndex()._to_alpha(num) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the RussellIndex.encode_alpha method instead.', +) def russell_index_alpha(word): """Return the Russell Index (alphabetic output) for the word. @@ -257,6 +302,9 @@ def russell_index_alpha(word): >>> russell_index_alpha('Schmidt') 'CMAD' + + .. versionadded:: 0.1.0 + """ return RussellIndex().encode_alpha(word) diff --git a/abydos/phonetic/_sfinx_bis.py b/abydos/phonetic/_sfinx_bis.py index 200b5a1df..f03ac46b1 100644 --- a/abydos/phonetic/_sfinx_bis.py +++ b/abydos/phonetic/_sfinx_bis.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['SfinxBis', 'sfinxbis'] @@ -46,6 +49,8 @@ class SfinxBis(_Phonetic): :cite:`Sjoo:2009`. SfinxBis is intended chiefly for Swedish names. + + .. 
versionadded:: 0.3.6 """ _adelstitler = ( @@ -151,15 +156,67 @@ class SfinxBis(_Phonetic): ) ) - def encode(self, word, max_length=-1): + _alphabetic = dict(zip((ord(_) for _ in '123456789#'), 'PKTLNRFSAŠ')) + + def __init__(self, max_length=-1): + """Initialize SfinxBis instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to unlimited) + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode_alpha(self, word): + """Return the alphabetic SfinxBis code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + tuple + The alphabetic SfinxBis value + + Examples + -------- + >>> pe = SfinxBis() + >>> pe.encode_alpha('Christopher') + ('KRSTFR',) + >>> pe.encode_alpha('Niall') + ('NL',) + >>> pe.encode_alpha('Smith') + ('SNT',) + >>> pe.encode_alpha('Schmidt') + ('SNT',) + + >>> pe.encode_alpha('Johansson') + ('JNSN',) + >>> pe.encode_alpha('Sjöberg') + ('ŠPRK',) + + + .. versionadded:: 0.4.0 + + """ + return tuple( + code.translate(self._alphabetic) for code in self.encode(word) + ) + + def encode(self, word): """Return the SfinxBis code for a word. Parameters ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to unlimited) Returns ------- @@ -183,6 +240,11 @@ def encode(self, word, max_length=-1): >>> pe.encode('Sjöberg') ('#162',) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _foersvensker(lokal_ordet): @@ -198,6 +260,8 @@ def _foersvensker(lokal_ordet): str Transformed word + .. versionadded:: 0.1.0 + """ lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN') lokal_ordet = lokal_ordet.replace('HIE', 'HJ') @@ -241,6 +305,8 @@ def _koda_foersta_ljudet(lokal_ordet): str Transformed word + .. versionadded:: 0.1.0 + """ if ( lokal_ordet[0:1] in self._mjuka_vokaler @@ -351,12 +417,18 @@ def _koda_foersta_ljudet(lokal_ordet): ] # truncate, if max_length is set - if max_length > 0: - ordlista = [ordet[:max_length] for ordet in ordlista] + if self._max_length > 0: + ordlista = [ordet[: self._max_length] for ordet in ordlista] return tuple(ordlista) +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SfinxBis.encode method instead.', +) def sfinxbis(word, max_length=-1): """Return the SfinxBis code for a word. @@ -390,8 +462,10 @@ def sfinxbis(word, max_length=-1): >>> sfinxbis('Sjöberg') ('#162',) + .. versionadded:: 0.1.0 + """ - return SfinxBis().encode(word, max_length) + return SfinxBis(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_sound_d.py b/abydos/phonetic/_sound_d.py index c4d967702..b8e051463 100644 --- a/abydos/phonetic/_sound_d.py +++ b/abydos/phonetic/_sound_d.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['SoundD', 'sound_d'] @@ -41,6 +44,8 @@ class SoundD(_Phonetic): """SoundD code. SoundD is defined in :cite:`Varol:2012`. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -50,15 +55,62 @@ class SoundD(_Phonetic): ) ) - def encode(self, word, max_length=4): + _alphabetic = dict(zip((ord(_) for _ in '0123456'), 'APKTLNR')) + + def __init__(self, max_length=4): + """Initialize SoundD instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to 4) + + + .. 
versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode_alpha(self, word): + """Return the alphabetic SoundD code. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic SoundD code + + Examples + -------- + >>> pe = SoundD() + >>> pe.encode_alpha('Gough') + 'K' + >>> pe.encode_alpha('pneuma') + 'NN' + >>> pe.encode_alpha('knight') + 'NT' + >>> pe.encode_alpha('trice') + 'TRK' + >>> pe.encode_alpha('judge') + 'KK' + + + .. versionadded:: 0.4.0 + + """ + return self.encode(word).rstrip('0').translate(self._alphabetic) + + def encode(self, word): """Return the SoundD code. Parameters ---------- word : str The word to transform - max_length : int - The length of the code returned (defaults to 4) Returns ------- @@ -67,17 +119,23 @@ def encode(self, word, max_length=4): Examples -------- - >>> sound_d('Gough') + >>> pe = SoundD() + >>> pe.encode('Gough') '2000' - >>> sound_d('pneuma') + >>> pe.encode('pneuma') '5500' - >>> sound_d('knight') + >>> pe.encode('knight') '5300' - >>> sound_d('trice') + >>> pe.encode('trice') '3620' - >>> sound_d('judge') + >>> pe.encode('judge') '2200' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') @@ -98,15 +156,21 @@ def encode(self, word, max_length=4): word = self._delete_consecutive_repeats(word) word = word.replace('0', '') - if max_length != -1: - if len(word) < max_length: - word += '0' * (max_length - len(word)) + if self._max_length != -1: + if len(word) < self._max_length: + word += '0' * (self._max_length - len(word)) else: - word = word[:max_length] + word = word[: self._max_length] return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SoundD.encode method instead.', +) def sound_d(word, max_length=4): """Return the SoundD code. @@ -135,8 +199,10 @@ def sound_d(word, max_length=4): >>> sound_d('judge') '2200' + .. versionadded:: 0.3.0 + """ - return SoundD().encode(word, max_length) + return SoundD(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_soundex.py b/abydos/phonetic/_soundex.py index 054bb4efd..7f9028a80 100644 --- a/abydos/phonetic/_soundex.py +++ b/abydos/phonetic/_soundex.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['Soundex', 'soundex'] @@ -51,6 +54,8 @@ class Soundex(_Phonetic): - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the US Census, including coding prefixed and unprefixed versions of some names + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -60,15 +65,15 @@ class Soundex(_Phonetic): ) ) - def encode( - self, word, max_length=4, var='American', reverse=False, zero_pad=True + _alphabetic = dict(zip((ord(_) for _ in '01234569'), 'APKTLNRH')) + + def __init__( + self, max_length=4, var='American', reverse=False, zero_pad=True ): - """Return the Soundex code for a word. + """Initialize Soundex instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 4) var : str @@ -92,6 +97,60 @@ def encode( Pad the end of the return value with 0s to achieve a max_length string + + .. 
versionadded:: 0.4.0 + + """ + # Require a max_length of at least 4 and not more than 64 + if max_length != -1: + self._max_length = min(max(4, max_length), 64) + else: + self._max_length = 64 + + self._var = var + self._reverse = reverse + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Soundex value + + Examples + -------- + >>> pe = Soundex() + >>> pe.encode_alpha("Christopher") + 'CRKT' + >>> pe.encode_alpha("Niall") + 'NL' + >>> pe.encode_alpha('Smith') + 'SNT' + >>> pe.encode_alpha('Schmidt') + 'SNT' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the Soundex code for a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -109,47 +168,68 @@ def encode( >>> pe.encode('Schmidt') 'S530' - >>> pe.encode('Christopher', max_length=-1) + >>> Soundex(max_length=-1).encode('Christopher') 'C623160000000000000000000000000000000000000000000000000000000000' - >>> pe.encode('Christopher', max_length=-1, zero_pad=False) + >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher') 'C62316' - >>> pe.encode('Christopher', reverse=True) + >>> Soundex(reverse=True).encode('Christopher') 'R132' >>> pe.encode('Ashcroft') 'A261' >>> pe.encode('Asicroft') 'A226' - >>> pe.encode('Ashcroft', var='special') + + >>> pe_special = Soundex(var='special') + >>> pe_special.encode('Ashcroft') 'A226' - >>> pe.encode('Asicroft', var='special') + >>> pe_special.encode('Asicroft') 'A226' - """ - # Require a max_length of at least 4 and not more than 64 - if max_length != -1: - max_length = min(max(4, max_length), 64) - else: - max_length = 64 + .. versionadded:: 0.1.0 + .. 
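
# With the Soundex refactor above, variant, length, reversal, and zero-padding
# are all constructor settings, so a configured instance can be reused across
# calls.  The codes are those shown in the patch's doctests.
from abydos.phonetic import Soundex

Soundex(max_length=-1, zero_pad=False).encode('Christopher')  # 'C62316'
Soundex(reverse=True).encode('Christopher')                   # 'R132'
Soundex(var='special').encode('Ashcroft')                     # 'A226'
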
versionchanged:: 0.3.6 + Encapsulated in class + + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') - if var == 'Census': + if self._var == 'Census': if word[:3] in {'VAN', 'CON'} and len(word) > 4: return ( - soundex(word, max_length, 'American', reverse, zero_pad), soundex( - word[3:], max_length, 'American', reverse, zero_pad + word, + self._max_length, + 'American', + self._reverse, + self._zero_pad, + ), + soundex( + word[3:], + self._max_length, + 'American', + self._reverse, + self._zero_pad, ), ) if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3: return ( - soundex(word, max_length, 'American', reverse, zero_pad), soundex( - word[2:], max_length, 'American', reverse, zero_pad + word, + self._max_length, + 'American', + self._reverse, + self._zero_pad, + ), + soundex( + word[2:], + self._max_length, + 'American', + self._reverse, + self._zero_pad, ), ) # Otherwise, proceed as usual (var='American' mode, ostensibly) @@ -158,18 +238,18 @@ def encode( # Nothing to convert, return base case if not word: - if zero_pad: - return '0' * max_length + if self._zero_pad: + return '0' * self._max_length return '0' # Reverse word if computing Reverse Soundex - if reverse: + if self._reverse: word = word[::-1] # apply the Soundex algorithm sdx = word.translate(self._trans) - if var == 'special': + if self._var == 'special': sdx = sdx.replace('9', '0') # special rule for 1880-1910 census else: sdx = sdx.replace('9', '') # rule 1 @@ -181,12 +261,18 @@ def encode( sdx = word[0] + sdx[1:] sdx = sdx.replace('0', '') # rule 1 - if zero_pad: - sdx += '0' * max_length # rule 4 + if self._zero_pad: + sdx += '0' * self._max_length # rule 4 - return sdx[:max_length] + return sdx[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Soundex.encode method instead.', +) def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True): """Return the Soundex code for a word. @@ -251,8 +337,10 @@ def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True): >>> soundex('Asicroft', var='special') 'A226' + .. versionadded:: 0.1.0 + """ - return Soundex().encode(word, max_length, var, reverse, zero_pad) + return Soundex(max_length, var, reverse, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_soundex_br.py b/abydos/phonetic/_soundex_br.py index 81a23fb1f..f2b74e46f 100644 --- a/abydos/phonetic/_soundex_br.py +++ b/abydos/phonetic/_soundex_br.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['SoundexBR', 'soundex_br'] @@ -41,6 +44,8 @@ class SoundexBR(_Phonetic): """SoundexBR. This is based on :cite:`Marcelino:2015`. + + .. versionadded:: 0.3.6 """ _trans = dict( @@ -50,19 +55,70 @@ class SoundexBR(_Phonetic): ) ) - def encode(self, word, max_length=4, zero_pad=True): - """Return the SoundexBR encoding of a word. + _alphabetic = dict(zip((ord(_) for _ in '0123456'), 'APKTLNR')) + + def __init__(self, max_length=4, zero_pad=True): + """Initialize SoundexBR instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string + + .. 
versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._zero_pad = zero_pad + + def encode_alpha(self, word): + """Return the alphabetic SoundexBR encoding of a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic SoundexBR code + + Examples + -------- + >>> pe = SoundexBR() + >>> pe.encode_alpha('Oliveira') + 'OLPR' + >>> pe.encode_alpha('Almeida') + 'ALNT' + >>> pe.encode_alpha('Barbosa') + 'BRPK' + >>> pe.encode_alpha('Araújo') + 'ARK' + >>> pe.encode_alpha('Gonçalves') + 'GNKL' + >>> pe.encode_alpha('Goncalves') + 'GNKL' + + + .. versionadded:: 0.4.0 + + """ + code = self.encode(word).rstrip('0') + return code[:1] + code[1:].translate(self._alphabetic) + + def encode(self, word): + """Return the SoundexBR encoding of a word. + + Parameters + ---------- + word : str + The word to transform + Returns ------- str @@ -70,19 +126,25 @@ def encode(self, word, max_length=4, zero_pad=True): Examples -------- - >>> soundex_br('Oliveira') + >>> pe = SoundexBR() + >>> pe.encode('Oliveira') 'O416' - >>> soundex_br('Almeida') + >>> pe.encode('Almeida') 'A453' - >>> soundex_br('Barbosa') + >>> pe.encode('Barbosa') 'B612' - >>> soundex_br('Araújo') + >>> pe.encode('Araújo') 'A620' - >>> soundex_br('Gonçalves') + >>> pe.encode('Gonçalves') 'G524' - >>> soundex_br('Goncalves') + >>> pe.encode('Goncalves') 'G524' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = unicode_normalize('NFKD', text_type(word.upper())) word = ''.join(c for c in word if c in self._uc_set) @@ -107,12 +169,18 @@ def encode(self, word, max_length=4, zero_pad=True): sdx = self._delete_consecutive_repeats(sdx) sdx = sdx.replace('0', '') - if zero_pad: - sdx += '0' * max_length + if self._zero_pad: + sdx += '0' * self._max_length - return sdx[:max_length] + return sdx[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SoundexBR.encode method instead.', +) def soundex_br(word, max_length=4, zero_pad=True): """Return the SoundexBR encoding of a word. @@ -147,8 +215,10 @@ def soundex_br(word, max_length=4, zero_pad=True): >>> soundex_br('Goncalves') 'G524' + .. versionadded:: 0.3.0 + """ - return SoundexBR().encode(word, max_length, zero_pad) + return SoundexBR(max_length, zero_pad).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_spanish_metaphone.py b/abydos/phonetic/_spanish_metaphone.py index b7cb644bb..9d22b6e4b 100644 --- a/abydos/phonetic/_spanish_metaphone.py +++ b/abydos/phonetic/_spanish_metaphone.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['SpanishMetaphone', 'spanish_metaphone'] @@ -45,21 +48,38 @@ class SpanishMetaphone(_Phonetic): :cite:`Mosquera:2012`. Modified version based on :cite:`delPilarAngeles:2016`. + + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=6, modified=False): - """Return the Spanish Metaphone of a word. + def __init__(self, max_length=6, modified=False): + """Initialize AlphaSIS instance. Parameters ---------- - word : str - The word to transform max_length : int The length of the code returned (defaults to 6) modified : bool Set to True to use del Pilar Angeles & Bailón-Miguel's modified version of the algorithm + + .. 
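
# The encode_alpha() methods added across this patch share one mechanism: strip
# trailing zero padding, keep the leading letter, and map the remaining digits to
# letters through a str.translate table such as _alphabetic above.  With the
# SoundexBR doctest values:
from abydos.phonetic import SoundexBR

pe = SoundexBR()
pe.encode('Oliveira')        # 'O416'
pe.encode_alpha('Oliveira')  # 'OLPR' -- digits 4/1/6 rendered as L/P/R
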
versionadded:: 0.4.0 + + """ + self._max_length = max_length + self._modified = modified + + def encode(self, word): + """Return the Spanish Metaphone of a word. + + Parameters + ---------- + word : str + The word to transform + + Returns ------- str @@ -79,6 +99,12 @@ def encode(self, word, max_length=6, modified=False): >>> pe.encode('Nicolás') 'NKLS' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + + """ def _is_vowel(pos): @@ -94,6 +120,8 @@ def _is_vowel(pos): bool True if word[pos] is a vowel + .. versionadded:: 0.3.0 + """ return pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'} @@ -103,7 +131,7 @@ def _is_vowel(pos): pos = 0 # do some replacements for the modified version - if modified: + if self._modified: word = word.replace('MB', 'NB') word = word.replace('MP', 'NP') word = word.replace('BS', 'S') @@ -124,7 +152,7 @@ def _is_vowel(pos): word = word.replace('B', 'V') word = word.replace('LL', 'Y') - while len(meta_key) < max_length: + while len(meta_key) < self._max_length: if pos >= len(word): break @@ -226,12 +254,18 @@ def _is_vowel(pos): pos += 1 # Final change from S to Z in modified version - if modified: + if self._modified: meta_key = meta_key.replace('S', 'Z') return meta_key +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SpanishMetaphone.encode method instead.', +) def spanish_metaphone(word, max_length=6, modified=False): """Return the Spanish Metaphone of a word. @@ -265,8 +299,11 @@ def spanish_metaphone(word, max_length=6, modified=False): >>> spanish_metaphone('Nicolás') 'NKLS' + + .. versionadded:: 0.3.0 + """ - return SpanishMetaphone().encode(word, max_length, modified) + return SpanishMetaphone(max_length, modified).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_spfc.py b/abydos/phonetic/_spfc.py index 64706f003..29b0a0c97 100644 --- a/abydos/phonetic/_spfc.py +++ b/abydos/phonetic/_spfc.py @@ -30,10 +30,13 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['SPFC', 'spfc'] @@ -43,6 +46,8 @@ class SPFC(_Phonetic): Standardized Phonetic Frequency Code is roughly Soundex-like. This implementation is based on page 19-21 of :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ _pf1 = dict( @@ -72,6 +77,57 @@ class SPFC(_Phonetic): ('MN', 'N'), ) + _pf1_alphabetic = dict(zip((ord(_) for _ in '01234567'), 'SCFALDEG')) + _pf2_alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'SCFAODMGUE')) + _pf3_alphabetic = dict(zip((ord(_) for _ in '01234567'), 'BDFGMRSZ')) + + def encode_alpha(self, word): + """Return the alphabetic SPFC of a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic SPFC value + + Examples + -------- + >>> pe = SPFC() + >>> pe.encode_alpha('Christopher Smith') + 'SDCMS' + >>> pe.encode_alpha('Christopher Schmidt') + 'SDCMS' + >>> pe.encode_alpha('Niall Smith') + 'SDMMS' + >>> pe.encode_alpha('Niall Schmidt') + 'SDMMS' + + >>> pe.encode_alpha('L.Smith') + 'SDEMS' + >>> pe.encode_alpha('R.Miller') + 'EROES' + + >>> pe.encode_alpha(('L', 'Smith')) + 'SDEMS' + >>> pe.encode_alpha(('R', 'Miller')) + 'EROES' + + + .. 
versionadded:: 0.4.0 + + """ + code = self.encode(word) + + return ( + code[:1].translate(self._pf1_alphabetic) + + code[1:2].translate(self._pf3_alphabetic) + + code[2:].translate(self._pf2_alphabetic) + ) + def encode(self, word): """Return the Standardized Phonetic Frequency Code (SPFC) of a word. @@ -114,6 +170,11 @@ def encode(self, word): >>> pe.encode(('R', 'Miller')) '65490' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _raise_word_ex(): @@ -126,6 +187,8 @@ def _raise_word_ex(): the first and last names or a tuple/list consisting of the first and last names + .. versionadded:: 0.1.0 + """ raise AttributeError( 'Word attribute must be a string with a space or period ' @@ -171,6 +234,8 @@ def _steps_one_to_three(name): str Transformed name + .. versionadded:: 0.1.0 + """ # filter out non A-Z name = ''.join(_ for _ in name if _ in self._uc_set) @@ -206,22 +271,10 @@ def _steps_one_to_three(name): # second digit of the code. Use as many letters as possible and remove # after coding. if names[1]: - if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS': - code += '8' - names[1] = names[1][:-3] - elif names[1][-2:] == 'SN': - code += '8' - names[1] = names[1][:-2] - elif names[1][-3:] == 'STR': - code += '9' - names[1] = names[1][:-3] - elif names[1][-2:] in {'SR', 'TN', 'TD'}: - code += '9' - names[1] = names[1][:-2] - elif names[1][-3:] == 'DRS': + if names[1][-3:] in {'DRS', 'STN', 'PRS', 'STR'}: code += '7' names[1] = names[1][:-3] - elif names[1][-2:] in {'TR', 'MN'}: + elif names[1][-2:] in {'MN', 'TR', 'SN', 'SR', 'TN', 'TD'}: code += '7' names[1] = names[1][:-2] else: @@ -249,6 +302,12 @@ def _steps_one_to_three(name): return code +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SPFC.encode method instead.', +) def spfc(word): """Return the Standardized Phonetic Frequency Code (SPFC) of a word. @@ -285,6 +344,8 @@ def spfc(word): >>> spfc(('R', 'Miller')) '65490' + .. versionadded:: 0.1.0 + """ return SPFC().encode(word) diff --git a/abydos/phonetic/_statistics_canada.py b/abydos/phonetic/_statistics_canada.py index 72369111a..91b0750c0 100644 --- a/abydos/phonetic/_statistics_canada.py +++ b/abydos/phonetic/_statistics_canada.py @@ -30,9 +30,12 @@ from unicodedata import normalize as unicode_normalize +from deprecation import deprecated + from six import text_type from ._phonetic import _Phonetic +from .. import __version__ __all__ = ['StatisticsCanada', 'statistics_canada'] @@ -48,17 +51,31 @@ class StatisticsCanada(_Phonetic): The modified version of this algorithm is described in Appendix B of :cite:`Moore:1977`. + + .. versionadded:: 0.3.6 """ - def encode(self, word, max_length=4): + def __init__(self, max_length=4): + """Initialize StatisticsCanada instance. + + Parameters + ---------- + max_length : int + The length of the code returned (defaults to 4) + + + .. versionadded:: 0.4.0 + + """ + self._max_length = max_length + + def encode(self, word): """Return the Statistics Canada code for a word. Parameters ---------- word : str The word to transform - max_length : int - The maximum length (default 4) of the code to return Returns ------- @@ -77,6 +94,11 @@ def encode(self, word, max_length=4): >>> pe.encode('Schmidt') 'SCHM' + + .. versionadded:: 0.3.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) @@ -92,9 +114,15 @@ def encode(self, word, max_length=4): code = self._delete_consecutive_repeats(code) code = code.replace(' ', '') - return code[:max_length] + return code[: self._max_length] +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the StatisticsCanada.encode method instead.', +) def statistics_canada(word, max_length=4): """Return the Statistics Canada code for a word. @@ -123,8 +151,10 @@ def statistics_canada(word, max_length=4): >>> statistics_canada('Schmidt') 'SCHM' + .. versionadded:: 0.3.0 + """ - return StatisticsCanada().encode(word, max_length) + return StatisticsCanada(max_length).encode(word) if __name__ == '__main__': diff --git a/abydos/phonetic/_waahlin.py b/abydos/phonetic/_waahlin.py new file mode 100644 index 000000000..15b83fa55 --- /dev/null +++ b/abydos/phonetic/_waahlin.py @@ -0,0 +1,232 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.phonetic._waahlin. + +Wåhlin phonetic encoding +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from unicodedata import normalize as unicode_normalize + +from six import text_type + +from ._phonetic import _Phonetic + +__all__ = ['Waahlin'] + + +class Waahlin(_Phonetic): + """Wåhlin code. + + Wåhlin's first-letter coding is based on the description in + :cite:`Erikson:1997`. + + .. versionadded:: 0.3.6 + """ + + def __init__(self, encoder=None): + """Initialize Waahlin instance. + + Parameters + ---------- + encoder : _Phonetic + An initialized phonetic algorithm object + + + .. 
versionadded:: 0.4.0 + + """ + self._encoder = encoder + + _transforms = { + 3: {'SCH': '*', 'STJ': '*', 'SKJ': '*'}, + 2: { + 'AE': 'E', + 'CH': 'K', + 'DJ': 'J', + 'GJ': 'J', + 'HJ': 'J', + 'HV': 'V', + 'HW': 'V', + 'HR': 'R', + 'KJ': '+', + 'LJ': 'J', + 'PH': 'F', + 'QU': 'KV', + 'SJ': '*', + 'TJ': '+', + }, + 1: {'Q': 'K', 'W': 'V', 'Z': 'S', 'Ä': 'E'}, + } + + def _encode_next(self, word): + if word[:3] == 'STI' and word[3:4] in {'E', 'Ä'}: + code = '*' + remainder = word[3:] + elif word[:3] in self._transforms[3]: + code = self._transforms[3][word[:3]] + remainder = word[3:] + elif word[:2] == 'HI' and word[2:3] in { + 'A', + 'E', + 'I', + 'O', + 'U', + 'Y', + 'Å', + 'Ä', + 'Ö', + }: + code = 'J' + remainder = word[2:] + elif word[:2] == 'SK' and word[2:3] in {'E', 'I', 'Y', 'Ä', 'Ö'}: + code = '*' + remainder = word[2:] + elif word[:2] in self._transforms[2]: + code = self._transforms[2][word[:2]] + remainder = word[2:] + elif word[:1] == 'C' and word[1:2] in {'E', 'I', 'Y', 'Ä'}: + code = 'S' + remainder = word[1:] + elif word[:1] == 'G' and word[1:2] in {'E', 'I', 'Y', 'Ä', 'Ö'}: + code = 'J' + remainder = word[1:] + elif word[:1] == 'I' and word[1:2] in { + 'A', + 'E', + 'I', + 'O', + 'U', + 'Y', + 'Å', + 'Ä', + 'Ö', + }: + code = 'J' + remainder = word[1:] + elif word[:1] == 'K' and word[1:2] in {'E', 'I', 'Y', 'Ä', 'Ö'}: + code = '+' + remainder = word[1:] + elif word[:1] in self._transforms[1]: + code = self._transforms[1][word[:1]] + remainder = word[1:] + else: + code = word[:1] + remainder = word[1:] + + return code, remainder + + def encode_alpha(self, word): + """Return the alphabetic Wåhlin code for a word. + + Parameters + ---------- + word : str + The word to transform + + Returns + ------- + str + The alphabetic Wåhlin code value + + Examples + -------- + >>> pe = Waahlin() + >>> pe.encode_alpha('Christopher') + 'KRISTOFER' + >>> pe.encode_alpha('Niall') + 'NJALL' + >>> pe.encode_alpha('Smith') + 'SMITH' + >>> pe.encode_alpha('Schmidt') + 'ŠMIDT' + + + .. versionadded:: 0.4.0 + + """ + return ( + self.encode(word, alphabetic=True) + .replace('+', 'Ç') + .replace('*', 'Š') + ) + + def encode(self, word, alphabetic=False): + """Return the Wåhlin code for a word. + + Parameters + ---------- + word : str + The word to transform + alphabetic : bool + If True, the encoder will apply its alphabetic form (.encode_alpha + rather than .encode) + + Returns + ------- + str + The Wåhlin code value + + Examples + -------- + >>> pe = Waahlin() + >>> pe.encode('Christopher') + 'KRISTOFER' + >>> pe.encode('Niall') + 'NJALL' + >>> pe.encode('Smith') + 'SMITH' + >>> pe.encode('Schmidt') + '*MIDT' + + + .. 
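A brief usage sketch for the new Wåhlin encoder follows. The import path and the choice of Soundex as the wrapped encoder are illustrative assumptions; the expected outputs are the docstring examples::

    from abydos.phonetic import Soundex, Waahlin

    pe = Waahlin()
    pe.encode('Schmidt')        # '*MIDT'
    pe.encode_alpha('Schmidt')  # 'ŠMIDT' ('*' and '+' rendered as 'Š' and 'Ç')

    # With an initialized encoder passed in, Wåhlin codes only the first
    # letter itself and hands the remainder to that encoder.
    Waahlin(encoder=Soundex()).encode('Schmidt')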
versionadded:: 0.4.0 + + """ + # uppercase, normalize, decompose, and filter non-A-Z out + word = unicode_normalize('NFC', text_type(word.upper())) + word = word.replace('ß', 'SS') + if not word: + return '' + + if self._encoder is None: + code = '' + while word: + part, word = self._encode_next(word) + code += part + return code + + code, word = self._encode_next(word) + return code + ( + self._encoder.encode_alpha(word) + if alphabetic + else self._encoder.encode(word) + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/stats/__init__.py b/abydos/stats/__init__.py index 0b2acec63..31d255f19 100644 --- a/abydos/stats/__init__.py +++ b/abydos/stats/__init__.py @@ -96,8 +96,8 @@ - :py:meth:`.false_neg` returns the number of false negatives - :py:meth:`.correct_pop` returns the correct population - :py:meth:`.error_pop` returns the error population - - :py:meth:`.test_pos_pop` returns the test positive population - - :py:meth:`.test_neg_pop` returns the test negative population + - :py:meth:`.pred_pos_pop` returns the test positive population + - :py:meth:`.pred_neg_pop` returns the test negative population - :py:meth:`.cond_pos_pop` returns the condition positive population - :py:meth:`.cond_neg_pop` returns the condition negative population - :py:meth:`.population` returns the total population @@ -147,7 +147,7 @@ >>> ct = ConfusionTable(120, 60, 20, 30) >>> ct.f1_score() -0.8275862068965516 +0.8275862068965518 >>> ct.mcc() 0.5367450401216932 >>> ct.specificity() diff --git a/abydos/stats/_confusion_table.py b/abydos/stats/_confusion_table.py index 696f37f4c..b10dd6408 100644 --- a/abydos/stats/_confusion_table.py +++ b/abydos/stats/_confusion_table.py @@ -43,6 +43,8 @@ import math +from deprecation import deprecated + from ._mean import ( aghmean, agmean, @@ -58,6 +60,7 @@ qmean, seiffert_mean, ) +from .. import __version__ __all__ = ['ConfusionTable'] @@ -106,6 +109,9 @@ def __init__(self, tp=0, tn=0, fp=0, fn=0): >>> ct == ConfusionTable({'tp': 120, 'tn': 60, 'fp': 20, 'fn': 30}) True + + .. versionadded:: 0.1.0 + """ if isinstance(tp, (tuple, list)): if len(tp) == 4: @@ -166,6 +172,9 @@ def __eq__(self, other): >>> ct1 != ct3 True + + .. versionadded:: 0.1.0 + """ if isinstance(other, ConfusionTable): if id(self) == id(other): @@ -209,11 +218,37 @@ def __str__(self): >>> str(ct) 'tp:120, tn:60, fp:20, fn:30' + + .. versionadded:: 0.1.0 + """ return 'tp:{}, tn:{}, fp:{}, fn:{}'.format( self._tp, self._tn, self._fp, self._fn ) + def __repr__(self): + """Return representation. + + Returns + ------- + str + A string representation of the ConfusionTable that can be used to + recreate it + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> repr(ct) + 'ConfusionTable(tp=120, tn=60, fp=20, fn=30)' + + + .. versionadded:: 0.4.0 + + """ + return 'ConfusionTable(tp={}, tn={}, fp={}, fn={})'.format( + self._tp, self._tn, self._fp, self._fn + ) + def to_tuple(self): """Cast to tuple. @@ -228,6 +263,9 @@ def to_tuple(self): >>> ct.to_tuple() (120, 60, 20, 30) + + .. versionadded:: 0.1.0 + """ return self._tp, self._tn, self._fp, self._fn @@ -246,6 +284,9 @@ def to_dict(self): >>> pprint.pprint(ct.to_dict()) {'fn': 30, 'fp': 20, 'tn': 60, 'tp': 120} + + .. versionadded:: 0.1.0 + """ return {'tp': self._tp, 'tn': self._tn, 'fp': self._fp, 'fn': self._fn} @@ -263,6 +304,9 @@ def true_pos(self): >>> ct.true_pos() 120 + + .. versionadded:: 0.1.0 + """ return self._tp @@ -280,12 +324,17 @@ def true_neg(self): >>> ct.true_neg() 60 + + .. 
versionadded:: 0.1.0 + """ return self._tn def false_pos(self): """Return false positives. + AKA Type I error + Returns ------- int @@ -297,12 +346,17 @@ def false_pos(self): >>> ct.false_pos() 20 + + .. versionadded:: 0.1.0 + """ return self._fp def false_neg(self): """Return false negatives. + AKA Type II error + Returns ------- int @@ -314,6 +368,9 @@ def false_neg(self): >>> ct.false_neg() 30 + + .. versionadded:: 0.1.0 + """ return self._fn @@ -331,6 +388,9 @@ def correct_pop(self): >>> ct.correct_pop() 180 + + .. versionadded:: 0.1.0 + """ return self._tp + self._tn @@ -348,40 +408,59 @@ def error_pop(self): >>> ct.error_pop() 50 + + .. versionadded:: 0.1.0 + """ return self._fp + self._fn - def test_pos_pop(self): - """Return test positive population. + def pred_pos_pop(self): + """Return predicted positive population. Returns ------- int - The test positive population of the confusion table + The predicted positive population of the confusion table Example ------- >>> ct = ConfusionTable(120, 60, 20, 30) - >>> ct.test_pos_pop() + >>> ct.pred_pos_pop() 140 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.4.0 + renamed from test_pos_pop + + + .. versionadded:: 0.1.0 + """ return self._tp + self._fp - def test_neg_pop(self): - """Return test negative population. + def pred_neg_pop(self): + """Return predicted negative population. Returns ------- int - The test negative population of the confusion table + The predicted negative population of the confusion table Example ------- >>> ct = ConfusionTable(120, 60, 20, 30) - >>> ct.test_neg_pop() + >>> ct.pred_neg_pop() 90 + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.4.0 + renamed from test_neg_pop + + + .. versionadded:: 0.1.0 + """ return self._tn + self._fn @@ -399,6 +478,9 @@ def cond_pos_pop(self): >>> ct.cond_pos_pop() 150 + + .. versionadded:: 0.1.0 + """ return self._tp + self._fn @@ -416,6 +498,9 @@ def cond_neg_pop(self): >>> ct.cond_neg_pop() 80 + + .. versionadded:: 0.1.0 + """ return self._fp + self._tn @@ -433,13 +518,20 @@ def population(self): >>> ct.population() 230 + + .. versionadded:: 0.1.0 + """ return self._tp + self._tn + self._fp + self._fn def precision(self): r"""Return precision. - Precision is defined as :math:`\frac{tp}{tp + fp}` + Precision is defined as + + .. math:: + + \frac{tp}{tp + fp} AKA positive predictive value (PPV) @@ -458,16 +550,23 @@ def precision(self): >>> ct.precision() 0.8571428571428571 + + .. versionadded:: 0.1.0 + """ - if self._tp + self._fp == 0: - return float('NaN') - return self._tp / (self._tp + self._fp) + try: + return self._tp / (self._tp + self._fp) + except ZeroDivisionError: + return float('nan') def precision_gain(self): r"""Return gain in precision. - The gain in precision is defined as: - :math:`G(precision) = \frac{precision}{random~ precision}` + The gain in precision is defined as + + .. math:: + + G(precision) = \frac{precision}{random~ precision} Cf. https://en.wikipedia.org/wiki/Gain_(information_retrieval) @@ -482,16 +581,24 @@ def precision_gain(self): >>> ct.precision_gain() 1.3142857142857143 + + .. versionadded:: 0.1.0 + """ - if self.population() == 0: - return float('NaN') - random_precision = self.cond_pos_pop() / self.population() - return self.precision() / random_precision + try: + random_precision = self.cond_pos_pop() / self.population() + return self.precision() / random_precision + except ZeroDivisionError: + return float('nan') def recall(self): r"""Return recall. - Recall is defined as :math:`\frac{tp}{tp + fn}` + Recall is defined as + + .. 
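The renamed population accessors can be exercised as below (a sketch; the values are those from the docstring examples, and the import assumes the usual abydos.stats re-export)::

    from abydos.stats import ConfusionTable

    ct = ConfusionTable(120, 60, 20, 30)
    ct.pred_pos_pop()   # 140 -- formerly ct.test_pos_pop()
    ct.pred_neg_pop()   # 90  -- formerly ct.test_neg_pop()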
math:: + + \frac{tp}{tp + fn} AKA sensitivity @@ -514,18 +621,28 @@ def recall(self): >>> ct.recall() 0.8 + + .. versionadded:: 0.1.0 + """ - if self._tp + self._fn == 0: - return float('NaN') - return self._tp / (self._tp + self._fn) + try: + return self._tp / (self._tp + self._fn) + except ZeroDivisionError: + return float('nan') def specificity(self): r"""Return specificity. - Specificity is defined as :math:`\frac{tn}{tn + fp}` + Specificity is defined as + + .. math:: + + \frac{tn}{tn + fp} AKA true negative rate (TNR) + AKA inverse recall + Cf. https://en.wikipedia.org/wiki/Specificity_(tests) Returns @@ -539,15 +656,55 @@ def specificity(self): >>> ct.specificity() 0.75 + + .. versionadded:: 0.1.0 + """ - if self._tn + self._fp == 0: - return float('NaN') - return self._tn / (self._tn + self._fp) + try: + return self._tn / (self._tn + self._fp) + except ZeroDivisionError: + return float('nan') + + def fnr(self): + r"""Return false negative rate. + + False negative rate is defined as + + .. math:: + + \frac{fn}{tp + fn} + + AKA miss rate + + Cf. https://en.wikipedia.org/wiki/False_negative_rate + + Returns + ------- + float + The false negative rate of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> round(ct.fnr(), 8) + 0.2 + + + .. versionadded:: 0.4.0 + + """ + return 1 - self.recall() def npv(self): r"""Return negative predictive value (NPV). - NPV is defined as :math:`\frac{tn}{tn + fn}` + NPV is defined as + + .. math:: + + \frac{tn}{tn + fn} + + AKA inverse precision Cf. https://en.wikipedia.org/wiki/Negative_predictive_value @@ -562,15 +719,54 @@ def npv(self): >>> ct.npv() 0.6666666666666666 + + .. versionadded:: 0.1.0 + """ - if self._tn + self._fn == 0: - return float('NaN') - return self._tn / (self._tn + self._fn) + try: + return self._tn / (self._tn + self._fn) + except ZeroDivisionError: + return float('nan') + + def false_omission_rate(self): + r"""Return false omission rate (FOR). + + FOR is defined as + + .. math:: + + \frac{fn}{tn + fn} + + Cf. https://en.wikipedia.org/wiki/False_omission_rate + + Returns + ------- + float + The false omission rate of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.false_omission_rate() + 0.3333333333333333 + + + .. versionadded:: 0.4.0 + + """ + try: + return self._fn / (self._tn + self._fn) + except ZeroDivisionError: + return float('nan') def fallout(self): r"""Return fall-out. - Fall-out is defined as :math:`\frac{fp}{fp + tn}` + Fall-out is defined as + + .. math:: + + \frac{fp}{fp + tn} AKA false positive rate (FPR) @@ -587,15 +783,110 @@ def fallout(self): >>> ct.fallout() 0.25 + + .. versionadded:: 0.1.0 + """ - if self._fp + self._tn == 0: - return float('NaN') - return self._fp / (self._fp + self._tn) + return 1 - self.specificity() + + def pos_likelihood_ratio(self): + r"""Return positive likelihood ratio. + + Positive likelihood ratio is defined as + + .. math:: + + \frac{recall}{1-specificity} + + Cf. + https://en.wikipedia.org/wiki/Likelihood_ratios_in_diagnostic_testing + + Returns + ------- + float + The positive likelihood ratio of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.pos_likelihood_ratio() + 3.2 + + + .. versionadded:: 0.4.0 + + """ + return self.recall() / (1.0 - self.specificity()) + + def neg_likelihood_ratio(self): + r"""Return negative likelihood ratio. + + Negative likelihood ratio is defined as + + .. math:: + + \frac{1-recall}{specificity} + + Cf. 
+ https://en.wikipedia.org/wiki/Likelihood_ratios_in_diagnostic_testing + + Returns + ------- + float + The negative likelihood ratio of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.neg_likelihood_ratio() + 0.2666666666666666 + + + .. versionadded:: 0.4.0 + + """ + return (1.0 - self.recall()) / self.specificity() + + def diagnostic_odds_ratio(self): + r"""Return diagnostic odds ratio. + + Diagnostic odds ratio is defined as + + .. math:: + + \frac{tp \cdot tn}{fp \cdot fn} + + Cf. + https://en.wikipedia.org/wiki/Diagnostic_odds_ratio + + Returns + ------- + float + The negative likelihood ratio of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.diagnostic_odds_ratio() + 12.0 + + + .. versionadded:: 0.4.0 + + """ + try: + return (self._tp * self._tn) / (self._fp * self._fn) + except ZeroDivisionError: + return float('nan') def fdr(self): r"""Return false discovery rate (FDR). - False discovery rate is defined as :math:`\frac{fp}{fp + tp}` + False discovery rate is defined as + + .. math:: + + \frac{fp}{fp + tp} Cf. https://en.wikipedia.org/wiki/False_discovery_rate @@ -610,15 +901,23 @@ def fdr(self): >>> ct.fdr() 0.14285714285714285 + + .. versionadded:: 0.1.0 + """ - if self._fp + self._tp == 0: - return float('NaN') - return self._fp / (self._fp + self._tp) + try: + return self._fp / (self._fp + self._tp) + except ZeroDivisionError: + return float('nan') def accuracy(self): r"""Return accuracy. - Accuracy is defined as :math:`\frac{tp + tn}{population}` + Accuracy is defined as + + .. math:: + + \frac{tp + tn}{population} Cf. https://en.wikipedia.org/wiki/Accuracy @@ -633,16 +932,23 @@ def accuracy(self): >>> ct.accuracy() 0.782608695652174 + + .. versionadded:: 0.1.0 + """ - if self.population() == 0: - return float('NaN') - return (self._tp + self._tn) / self.population() + try: + return (self._tp + self._tn) / self.population() + except ZeroDivisionError: + return float('nan') def accuracy_gain(self): r"""Return gain in accuracy. - The gain in accuracy is defined as: - :math:`G(accuracy) = \frac{accuracy}{random~ accuracy}` + The gain in accuracy is defined as + + .. math:: + + G(accuracy) = \frac{accuracy}{random~ accuracy} Cf. https://en.wikipedia.org/wiki/Gain_(information_retrieval) @@ -657,19 +963,26 @@ def accuracy_gain(self): >>> ct.accuracy_gain() 1.4325259515570934 + + .. versionadded:: 0.1.0 + """ - if self.population() == 0: - return float('NaN') - random_accuracy = (self.cond_pos_pop() / self.population()) ** 2 + ( - self.cond_neg_pop() / self.population() - ) ** 2 - return self.accuracy() / random_accuracy + try: + random_accuracy = ( + self.cond_pos_pop() / self.population() + ) ** 2 + (self.cond_neg_pop() / self.population()) ** 2 + return self.accuracy() / random_accuracy + except ZeroDivisionError: + return float('nan') def balanced_accuracy(self): r"""Return balanced accuracy. Balanced accuracy is defined as - :math:`\frac{sensitivity + specificity}{2}` + + .. math:: + + \frac{sensitivity + specificity}{2} Cf. https://en.wikipedia.org/wiki/Accuracy @@ -684,13 +997,82 @@ def balanced_accuracy(self): >>> ct.balanced_accuracy() 0.775 + + .. versionadded:: 0.1.0 + """ return 0.5 * (self.recall() + self.specificity()) + def error_rate(self): + r"""Return error rate. + + Error rate is defined as + + .. 
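The diagnostic measures added to ConfusionTable in 0.4.0 (including error_rate and prevalence, defined just below) can be sampled together; the expected values mirror the docstring examples::

    from abydos.stats import ConfusionTable

    ct = ConfusionTable(120, 60, 20, 30)
    round(ct.fnr(), 8)          # 0.2  (miss rate)
    ct.false_omission_rate()    # 0.3333333333333333
    ct.pos_likelihood_ratio()   # 3.2
    ct.neg_likelihood_ratio()   # 0.2666666666666666
    ct.diagnostic_odds_ratio()  # 12.0
    ct.error_rate()             # 0.21739130434782608
    ct.prevalence()             # 0.6521739130434783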
math:: + + \frac{fp + fn}{population} + + Returns + ------- + float + The error rate of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.error_rate() + 0.21739130434782608 + + + .. versionadded:: 0.4.0 + + """ + try: + return (self._fn + self._fp) / ( + self._fn + self._fp + self._tn + self._tp + ) + except ZeroDivisionError: + return float('nan') + + def prevalence(self): + r"""Return prevalence. + + Prevalence is defined as + + .. math:: + + \frac{condition positive}{population} + + Cf. https://en.wikipedia.org/wiki/Prevalence + + Returns + ------- + float + The prevelence of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.prevalence() + 0.6521739130434783 + + + .. versionadded:: 0.4.0 + + """ + try: + return self.cond_pos_pop() / self.population() + except ZeroDivisionError: + return float('nan') + def informedness(self): """Return informedness. - Informedness is defined as :math:`sensitivity + specificity - 1`. + Informedness is defined as + + .. math:: + + sensitivity + specificity - 1 AKA Youden's J statistic (:cite:`Youden:1950`) @@ -709,13 +1091,22 @@ def informedness(self): >>> ct.informedness() 0.55 + + .. versionadded:: 0.1.0 + """ return self.recall() + self.specificity() - 1 def markedness(self): """Return markedness. - Markedness is defined as :math:`precision + npv - 1` + Markedness is defined as + + .. math:: + + precision + npv - 1 + + AKA DeltaP Returns ------- @@ -728,14 +1119,20 @@ def markedness(self): >>> ct.markedness() 0.5238095238095237 + + .. versionadded:: 0.1.0 + """ return self.precision() + self.npv() - 1 def pr_amean(self): r"""Return arithmetic mean of precision & recall. - The arithmetic mean of precision and recall is defined as: - :math:`\frac{precision \cdot recall}{2}` + The arithmetic mean of precision and recall is defined as + + .. math:: + + \frac{precision \cdot recall}{2} Cf. https://en.wikipedia.org/wiki/Arithmetic_mean @@ -750,6 +1147,9 @@ def pr_amean(self): >>> ct.pr_amean() 0.8285714285714285 + + .. versionadded:: 0.1.0 + """ return amean((self.precision(), self.recall())) @@ -757,7 +1157,10 @@ def pr_gmean(self): r"""Return geometric mean of precision & recall. The geometric mean of precision and recall is defined as: - :math:`\sqrt{precision \cdot recall}` + + .. math:: + + \sqrt{precision \cdot recall} Cf. https://en.wikipedia.org/wiki/Geometric_mean @@ -772,14 +1175,20 @@ def pr_gmean(self): >>> ct.pr_gmean() 0.828078671210825 + + .. versionadded:: 0.1.0 + """ return gmean((self.precision(), self.recall())) def pr_hmean(self): r"""Return harmonic mean of precision & recall. - The harmonic mean of precision and recall is defined as: - :math:`\frac{2 \cdot precision \cdot recall}{precision + recall}` + The harmonic mean of precision and recall is defined as + + .. math:: + + \frac{2 \cdot precision \cdot recall}{precision + recall} Cf. https://en.wikipedia.org/wiki/Harmonic_mean @@ -794,14 +1203,20 @@ def pr_hmean(self): >>> ct.pr_hmean() 0.8275862068965516 + + .. versionadded:: 0.1.0 + """ return hmean((self.precision(), self.recall())) def pr_qmean(self): r"""Return quadratic mean of precision & recall. - The quadratic mean of precision and recall is defined as: - :math:`\sqrt{\frac{precision^{2} + recall^{2}}{2}}` + The quadratic mean of precision and recall is defined as + + .. math:: + + \sqrt{\frac{precision^{2} + recall^{2}}{2}} Cf. 
https://en.wikipedia.org/wiki/Quadratic_mean @@ -816,14 +1231,20 @@ def pr_qmean(self): >>> ct.pr_qmean() 0.8290638930598233 + + .. versionadded:: 0.1.0 + """ return qmean((self.precision(), self.recall())) def pr_cmean(self): r"""Return contraharmonic mean of precision & recall. - The contraharmonic mean is: - :math:`\frac{precision^{2} + recall^{2}}{precision + recall}` + The contraharmonic mean is + + .. math:: + + \frac{precision^{2} + recall^{2}}{precision + recall} Cf. https://en.wikipedia.org/wiki/Contraharmonic_mean @@ -838,6 +1259,9 @@ def pr_cmean(self): >>> ct.pr_cmean() 0.8295566502463055 + + .. versionadded:: 0.1.0 + """ return cmean((self.precision(), self.recall())) @@ -847,8 +1271,12 @@ def pr_lmean(self): The logarithmic mean is: 0 if either precision or recall is 0, the precision if they are equal, - otherwise :math:`\frac{precision - recall} - {ln(precision) - ln(recall)}` + otherwise + + .. math:: + + \frac{precision - recall} + {ln(precision) - ln(recall)} Cf. https://en.wikipedia.org/wiki/Logarithmic_mean @@ -863,6 +1291,9 @@ def pr_lmean(self): >>> ct.pr_lmean() 0.8282429171492667 + + .. versionadded:: 0.1.0 + """ precision = self.precision() recall = self.recall() @@ -877,9 +1308,13 @@ def pr_imean(self): The identric mean is: precision if precision = recall, - otherwise :math:`\frac{1}{e} \cdot - \sqrt[precision - recall]{\frac{precision^{precision}} - {recall^{recall}}}` + otherwise + + .. math:: + + \frac{1}{e} \cdot + \sqrt[precision - recall]{\frac{precision^{precision}} + {recall^{recall}}} Cf. https://en.wikipedia.org/wiki/Identric_mean @@ -894,15 +1329,21 @@ def pr_imean(self): >>> ct.pr_imean() 0.8284071826325543 + + .. versionadded:: 0.1.0 + """ return imean((self.precision(), self.recall())) def pr_seiffert_mean(self): r"""Return Seiffert's mean of precision & recall. - Seiffert's mean of precision and recall is: - :math:`\frac{precision - recall}{4 \cdot arctan - \sqrt{\frac{precision}{recall}} - \pi}` + Seiffert's mean of precision and recall is + + .. math:: + + \frac{precision - recall}{4 \cdot arctan + \sqrt{\frac{precision}{recall}} - \pi} It is defined in :cite:`Seiffert:1993`. @@ -917,15 +1358,21 @@ def pr_seiffert_mean(self): >>> ct.pr_seiffert_mean() 0.8284071696048312 + + .. versionadded:: 0.1.0 + """ return seiffert_mean((self.precision(), self.recall())) def pr_lehmer_mean(self, exp=2.0): r"""Return Lehmer mean of precision & recall. - The Lehmer mean is: - :math:`\frac{precision^{exp} + recall^{exp}} - {precision^{exp-1} + recall^{exp-1}}` + The Lehmer mean is + + .. math:: + + \frac{precision^{exp} + recall^{exp}} + {precision^{exp-1} + recall^{exp-1}} Cf. https://en.wikipedia.org/wiki/Lehmer_mean @@ -946,14 +1393,20 @@ def pr_lehmer_mean(self, exp=2.0): >>> ct.pr_lehmer_mean() 0.8295566502463055 + + .. versionadded:: 0.1.0 + """ return lehmer_mean((self.precision(), self.recall()), exp) def pr_heronian_mean(self): r"""Return Heronian mean of precision & recall. - The Heronian mean of precision and recall is defined as: - :math:`\frac{precision + \sqrt{precision \cdot recall} + recall}{3}` + The Heronian mean of precision and recall is defined as + + .. math:: + + \frac{precision + \sqrt{precision \cdot recall} + recall}{3} Cf. https://en.wikipedia.org/wiki/Heronian_mean @@ -968,15 +1421,22 @@ def pr_heronian_mean(self): >>> ct.pr_heronian_mean() 0.8284071761178939 + + .. 
versionadded:: 0.1.0 + """ return heronian_mean((self.precision(), self.recall())) def pr_hoelder_mean(self, exp=2): r"""Return Hölder (power/generalized) mean of precision & recall. - The power mean of precision and recall is defined as: - :math:`\frac{1}{2} \cdot - \sqrt[exp]{precision^{exp} + recall^{exp}}` + The power mean of precision and recall is defined as + + .. math:: + + \frac{1}{2} \cdot + \sqrt[exp]{precision^{exp} + recall^{exp}} + for :math:`exp \ne 0`, and the geometric mean for :math:`exp = 0` Cf. https://en.wikipedia.org/wiki/Generalized_mean @@ -998,6 +1458,9 @@ def pr_hoelder_mean(self, exp=2): >>> ct.pr_hoelder_mean() 0.8290638930598233 + + .. versionadded:: 0.1.0 + """ return hoelder_mean((self.precision(), self.recall()), exp) @@ -1021,6 +1484,9 @@ def pr_agmean(self): >>> ct.pr_agmean() 0.8283250315702829 + + .. versionadded:: 0.1.0 + """ return agmean((self.precision(), self.recall())) @@ -1044,6 +1510,9 @@ def pr_ghmean(self): >>> ct.pr_ghmean() 0.8278323841238441 + + .. versionadded:: 0.1.0 + """ return ghmean((self.precision(), self.recall())) @@ -1066,6 +1535,9 @@ def pr_aghmean(self): >>> ct.pr_aghmean() 0.8280786712108288 + + .. versionadded:: 0.1.0 + """ return aghmean((self.precision(), self.recall())) @@ -1077,9 +1549,12 @@ def fbeta_score(self, beta=1.0): attaches :math:`\beta` times as much importance to recall as precision" (van Rijsbergen 1979) - :math:`F_{\beta}` score is defined as: - :math:`(1 + \beta^2) \cdot \frac{precision \cdot recall} - {((\beta^2 \cdot precision) + recall)}` + :math:`F_{\beta}` score is defined as + + .. math:: + + (1 + \beta^2) \cdot \frac{precision \cdot recall} + {((\beta^2 \cdot precision) + recall)} Cf. https://en.wikipedia.org/wiki/F1_score @@ -1106,13 +1581,16 @@ def fbeta_score(self, beta=1.0): >>> ct.fbeta_score(beta=0.1) 0.8565371024734982 + + .. versionadded:: 0.1.0 + """ - if beta <= 0: + if beta <= 0.0: raise AttributeError('Beta must be a positive real value.') precision = self.precision() recall = self.recall() return ( - (1 + beta ** 2) + (1.0 + beta ** 2) * precision * recall / ((beta ** 2 * precision) + recall) @@ -1137,6 +1615,9 @@ def f2_score(self): >>> ct.f2_score() 0.8108108108108109 + + .. versionadded:: 0.1.0 + """ return self.fbeta_score(2.0) @@ -1159,10 +1640,13 @@ def fhalf_score(self): >>> ct.fhalf_score() 0.8450704225352114 + + .. versionadded:: 0.1.0 + """ return self.fbeta_score(0.5) - def e_score(self, beta=1): + def e_score(self, beta=1.0): r"""Return :math:`E`-score. This is Van Rijsbergen's effectiveness measure: @@ -1186,14 +1670,20 @@ def e_score(self, beta=1): >>> ct.e_score() 0.17241379310344818 + + .. versionadded:: 0.1.0 + """ - return 1 - self.fbeta_score(beta) + return 1.0 - self.fbeta_score(beta) def f1_score(self): r"""Return :math:`F_{1}` score. - :math:`F_{1}` score is the harmonic mean of precision and recall: - :math:`2 \cdot \frac{precision \cdot recall}{precision + recall}` + :math:`F_{1}` score is the harmonic mean of precision and recall + + .. math:: + + 2 \cdot \frac{precision \cdot recall}{precision + recall} Cf. https://en.wikipedia.org/wiki/F1_score @@ -1206,16 +1696,28 @@ def f1_score(self): ------- >>> ct = ConfusionTable(120, 60, 20, 30) >>> ct.f1_score() - 0.8275862068965516 + 0.8275862068965518 - """ - return self.pr_hmean() + .. 
versionadded:: 0.1.0 + + """ + return self.fbeta_score(1.0) + + @deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the ConfusionTable.pr_hmean method instead.', + ) def f_measure(self): r"""Return :math:`F`-measure. - :math:`F`-measure is the harmonic mean of precision and recall: - :math:`2 \cdot \frac{precision \cdot recall}{precision + recall}` + :math:`F`-measure is the harmonic mean of precision and recall + + .. math:: + + 2 \cdot \frac{precision \cdot recall}{precision + recall} Cf. https://en.wikipedia.org/wiki/F1_score @@ -1230,14 +1732,55 @@ def f_measure(self): >>> ct.f_measure() 0.8275862068965516 + + .. versionadded:: 0.1.0 + """ return self.pr_hmean() + def jaccard(self): + r"""Return Jaccard index. + + The Jaccard index of a confusion table is + + .. math:: + + \frac{tp}{tp+fp+fn} + + Returns + ------- + float + The Jaccard index of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.jaccard() + 0.7058823529411765 + + + .. versionadded:: 0.4.0 + + """ + try: + return self._tp / (self._tp + self._fp + self._fn) + except ZeroDivisionError: + return float('nan') + + @deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the ConfusionTable.pr_gmean method instead.', + ) def g_measure(self): r"""Return G-measure. :math:`G`-measure is the geometric mean of precision and recall: - :math:`\sqrt{precision \cdot recall}` + + .. math:: + + \sqrt{precision \cdot recall} This is identical to the Fowlkes–Mallows (FM) index for two clusters. @@ -1257,16 +1800,50 @@ def g_measure(self): >>> ct.g_measure() 0.828078671210825 + + .. versionadded:: 0.1.0 + """ return self.pr_gmean() + def d_measure(self): + r"""Return D-measure. + + :math:`D`-measure is defined as + + .. math:: + + 1-\frac{1}{\frac{1}{precision}+\frac{1}{recall}-1} + + Returns + ------- + float + The :math:`D`-measure of the confusion table + + Examples + -------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.d_measure() + 0.2941176470588237 + + + .. versionadded:: 0.4.0 + + """ + return 1.0 - ( + 1.0 / (1.0 / self.precision() + 1.0 / self.recall() - 1.0) + ) + def mcc(self): r"""Return Matthews correlation coefficient (MCC). The Matthews correlation coefficient is defined in :cite:`Matthews:1975` as: - :math:`\frac{(tp \cdot tn) - (fp \cdot fn)} - {\sqrt{(tp + fp)(tp + fn)(tn + fp)(tn + fn)}}` + + .. math:: + + \frac{(tp \cdot tn) - (fp \cdot fn)} + {\sqrt{(tp + fp)(tp + fn)(tn + fp)(tn + fn)}} This is equivalent to the geometric mean of informedness and markedness, defined above. @@ -1284,30 +1861,30 @@ def mcc(self): >>> ct.mcc() 0.5367450401216932 + + .. versionadded:: 0.1.0 + """ - if ( - ( + try: + return ((self._tp * self._tn) - (self._fp * self._fn)) / math.sqrt( (self._tp + self._fp) * (self._tp + self._fn) * (self._tn + self._fp) * (self._tn + self._fn) ) - ) == 0: - return float('NaN') - return ((self._tp * self._tn) - (self._fp * self._fn)) / math.sqrt( - (self._tp + self._fp) - * (self._tp + self._fn) - * (self._tn + self._fp) - * (self._tn + self._fn) - ) + except ZeroDivisionError: + return float('nan') def significance(self): r"""Return the significance, :math:`\chi^{2}`. - Significance is defined as: - :math:`\chi^{2} = - \frac{(tp \cdot tn - fp \cdot fn)^{2} (tp + tn + fp + fn)} - {((tp + fp)(tp + fn)(tn + fp)(tn + fn)}` + Significance is defined as + + .. 
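Because f1_score() is now computed through fbeta_score(1.0) rather than the harmonic-mean helper, its final digit shifts slightly; pr_hmean() keeps the old value, and the new Jaccard and D-measure methods are shown alongside. A sketch, with expected values taken from the docstring examples::

    from abydos.stats import ConfusionTable

    ct = ConfusionTable(120, 60, 20, 30)
    ct.f1_score()   # 0.8275862068965518  (via fbeta_score(1.0))
    ct.pr_hmean()   # 0.8275862068965516  (what the deprecated f_measure() returns)
    ct.jaccard()    # 0.7058823529411765
    ct.d_measure()  # 0.2941176470588237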
math:: + + \chi^{2} = + \frac{(tp \cdot tn - fp \cdot fn)^{2} (tp + tn + fp + fn)} + {((tp + fp)(tp + fn)(tn + fp)(tn + fn)}` Also: :math:`\chi^{2} = MCC^{2} \cdot n` @@ -1324,32 +1901,32 @@ def significance(self): >>> ct.significance() 66.26190476190476 + + .. versionadded:: 0.1.0 + """ - if ( - ( + try: + return ( + (self._tp * self._tn - self._fp * self._fn) ** 2 + * (self._tp + self._tn + self._fp + self._fn) + ) / ( (self._tp + self._fp) * (self._tp + self._fn) * (self._tn + self._fp) * (self._tn + self._fn) ) - ) == 0: - return float('NaN') - return ( - (self._tp * self._tn - self._fp * self._fn) ** 2 - * (self._tp + self._tn + self._fp + self._fn) - ) / ( - (self._tp + self._fp) - * (self._tp + self._fn) - * (self._tn + self._fp) - * (self._tn + self._fn) - ) + except ZeroDivisionError: + return float('nan') def kappa_statistic(self): r"""Return κ statistic. - The κ statistic is defined as: - :math:`\kappa = \frac{accuracy - random~ accuracy} - {1 - random~ accuracy}` + The κ statistic is defined as + + .. math:: + + \kappa = \frac{accuracy - random~ accuracy} + {1 - random~ accuracy}` The κ statistic compares the performance of the classifier relative to the performance of a random classifier. :math:`\kappa` = 0 indicates @@ -1368,14 +1945,301 @@ def kappa_statistic(self): >>> ct.kappa_statistic() 0.5344129554655871 + + .. versionadded:: 0.1.0 + + """ + try: + random_accuracy = ( + (self._tn + self._fp) * (self._tn + self._fn) + + (self._fn + self._tp) * (self._fp + self._tp) + ) / self.population() ** 2 + return (self.accuracy() - random_accuracy) / (1 - random_accuracy) + except ZeroDivisionError: + return float('nan') + + def phi_coefficient(self): + r"""Return φ coefficient. + + The :math:`\phi` coefficient is defined as + + .. math:: + + \phi = \frac{tp \cdot tn - fp \cdot tn} + {\sqrt{(tp + fp) \cdot (tp + fn) \cdot (tn + fp) \cdot + (tn + fn)}} + + Returns + ------- + float + The φ coefficient of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.phi_coefficient() + 0.5367450401216932 + + + .. versionadded:: 0.4.0 + + """ + try: + return ((self._tp * self._tn) - (self._fp * self._fn)) / ( + (self._tp + self._fn) + * (self._tp + self._fp) + * (self._tn + self._fn) + * (self._tn + self._fp) + ) ** 0.5 + except ZeroDivisionError: + return float('nan') + + def joint_entropy(self): + """Return the joint entropy. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The joint entropy of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.joint_entropy() + 1.1680347446270396 + + + .. versionadded:: 0.4.0 + + """ + try: + return ( + math.log(self.population()) + - sum(_ * math.log(_) for _ in self.to_tuple()) + / self.population() + ) + except ValueError: + return float('nan') + + def actual_entropy(self): + """Return the actual entropy. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The actual entropy of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.actual_entropy() + 0.6460905050608101 + + + .. versionadded:: 0.4.0 + + """ + try: + return ( + math.log(self.population()) + - sum( + _ * math.log(_) + for _ in (self.cond_pos_pop(), self.cond_neg_pop()) + ) + / self.population() + ) + except ValueError: + return float('nan') + + def predicted_entropy(self): + """Return the predicted entropy. 
+ + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The predicted entropy of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.predicted_entropy() + 0.6693279632926457 + + + .. versionadded:: 0.4.0 + """ - if self.population() == 0: - return float('NaN') - random_accuracy = ( - (self._tn + self._fp) * (self._tn + self._fn) - + (self._fn + self._tp) * (self._fp + self._tp) - ) / self.population() ** 2 - return (self.accuracy() - random_accuracy) / (1 - random_accuracy) + try: + return ( + math.log(self.population()) + - sum( + _ * math.log(_) + for _ in (self.pred_pos_pop(), self.pred_neg_pop()) + ) + / self.population() + ) + except ValueError: + return float('nan') + + def mutual_information(self): + """Return the mutual information. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The mutual information of the confusion table + + Cf. https://en.wikipedia.org/wiki/Mutual_information + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.mutual_information() + 0.14738372372641576 + + + .. versionadded:: 0.4.0 + + """ + try: + return ( + sum( + _[0] * math.log(self.population() * _[0] / _[1]) + for _ in ( + ( + ( + self._fp, + self.cond_neg_pop() * self.pred_pos_pop(), + ), + ( + self._fn, + self.cond_pos_pop() * self.pred_neg_pop(), + ), + ( + self._tn, + self.cond_neg_pop() * self.pred_neg_pop(), + ), + ( + self._tp, + self.cond_pos_pop() * self.pred_pos_pop(), + ), + ) + ) + ) + / self.population() + ) + except ZeroDivisionError: + return float('nan') + + def proficiency(self): + """Return the proficiency. + + Implementation based on https://github.com/Magnetic/proficiency-metric + :cite:`Steingold:2015` + + AKA uncertainty coefficient + + Cf. https://en.wikipedia.org/wiki/Uncertainty_coefficient + + Returns + ------- + float + The proficiency of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.proficiency() + 0.228116219897929 + + + .. versionadded:: 0.4.0 + + """ + return self.mutual_information() / self.actual_entropy() + + def igr(self): + """Return information gain ratio. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Cf. https://en.wikipedia.org/wiki/Information_gain_ratio + + Returns + ------- + float + The information gain ratio of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.igr() + 0.22019657299448012 + + + .. versionadded:: 0.4.0 + + """ + return self.mutual_information() / self.predicted_entropy() + + def dependency(self): + """Return dependency. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The dependency of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.dependency() + 0.12618094145262454 + + + .. versionadded:: 0.4.0 + + """ + return self.mutual_information() / self.joint_entropy() + + def lift(self): + """Return lift. + + Implementation based on https://github.com/Magnetic/proficiency-metric + + Returns + ------- + float + The lift of the confusion table + + Example + ------- + >>> ct = ConfusionTable(120, 60, 20, 30) + >>> ct.lift() + 1.3142857142857143 + + + .. 
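The information-theoretic additions can be exercised together; the expected values are those given in the docstring examples, and the import assumes the usual abydos.stats re-export::

    from abydos.stats import ConfusionTable

    ct = ConfusionTable(120, 60, 20, 30)
    ct.joint_entropy()       # 1.1680347446270396
    ct.actual_entropy()      # 0.6460905050608101
    ct.predicted_entropy()   # 0.6693279632926457
    ct.mutual_information()  # 0.14738372372641576
    ct.proficiency()         # 0.228116219897929   (mutual info / actual entropy)
    ct.igr()                 # 0.22019657299448012 (mutual info / predicted entropy)
    ct.dependency()          # 0.12618094145262454 (mutual info / joint entropy)
    ct.lift()                # 1.3142857142857143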
versionadded:: 0.4.0 + + """ + try: + return ( + self._tp + * self.population() + / (self.pred_pos_pop() * self.cond_pos_pop()) + ) + except ZeroDivisionError: + return float('nan') if __name__ == '__main__': diff --git a/abydos/stats/_mean.py b/abydos/stats/_mean.py index 13a6e0b29..9cd8d12e8 100644 --- a/abydos/stats/_mean.py +++ b/abydos/stats/_mean.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -62,8 +62,11 @@ def amean(nums): r"""Return arithmetic mean. - The arithmetic mean is defined as: - :math:`\frac{\sum{nums}}{|nums|}` + The arithmetic mean is defined as + + .. math:: + + \frac{\sum{nums}}{|nums|} Cf. https://en.wikipedia.org/wiki/Arithmetic_mean @@ -86,6 +89,8 @@ def amean(nums): >>> amean([0, 5, 1000]) 335.0 + .. versionadded:: 0.1.0 + """ return sum(nums) / len(nums) @@ -93,8 +98,11 @@ def amean(nums): def gmean(nums): r"""Return geometric mean. - The geometric mean is defined as: - :math:`\sqrt[|nums|]{\prod\limits_{i} nums_{i}}` + The geometric mean is defined as + + .. math:: + + \sqrt[|nums|]{\prod\limits_{i} nums_{i}} Cf. https://en.wikipedia.org/wiki/Geometric_mean @@ -117,6 +125,8 @@ def gmean(nums): >>> gmean([0, 5, 1000]) 0.0 + .. versionadded:: 0.1.0 + """ return _prod(nums) ** (1 / len(nums)) @@ -124,8 +134,11 @@ def gmean(nums): def hmean(nums): r"""Return harmonic mean. - The harmonic mean is defined as: - :math:`\frac{|nums|}{\sum\limits_{i}\frac{1}{nums_i}}` + The harmonic mean is defined as + + .. math:: + + \frac{|nums|}{\sum\limits_{i}\frac{1}{nums_i}} Following the behavior of Wolfram|Alpha: - If one of the values in nums is 0, return 0. @@ -145,7 +158,7 @@ def hmean(nums): Raises ------ - AttributeError + ValueError hmean requires at least one value Examples @@ -157,9 +170,11 @@ def hmean(nums): >>> hmean([0, 5, 1000]) 0 + .. versionadded:: 0.1.0 + """ if len(nums) < 1: - raise AttributeError('hmean requires at least one value') + raise ValueError('hmean requires at least one value') elif len(nums) == 1: return nums[0] else: @@ -179,8 +194,11 @@ def hmean(nums): def qmean(nums): r"""Return quadratic mean. - The quadratic mean of precision and recall is defined as: - :math:`\sqrt{\sum\limits_{i} \frac{num_i^2}{|nums|}}` + The quadratic mean is defined as + + .. math:: + + \sqrt{\sum\limits_{i} \frac{num_i^2}{|nums|}} Cf. https://en.wikipedia.org/wiki/Quadratic_mean @@ -203,6 +221,8 @@ def qmean(nums): >>> qmean([0, 5, 1000]) 577.3574860228857 + .. versionadded:: 0.1.0 + """ return (sum(i ** 2 for i in nums) / len(nums)) ** 0.5 @@ -210,8 +230,11 @@ def qmean(nums): def cmean(nums): r"""Return contraharmonic mean. - The contraharmonic mean is: - :math:`\frac{\sum\limits_i x_i^2}{\sum\limits_i x_i}` + The contraharmonic mean is + + .. math:: + + \frac{\sum\limits_i x_i^2}{\sum\limits_i x_i} Cf. https://en.wikipedia.org/wiki/Contraharmonic_mean @@ -234,6 +257,8 @@ def cmean(nums): >>> cmean([0, 5, 1000]) 995.0497512437811 + .. versionadded:: 0.1.0 + """ return sum(x ** 2 for x in nums) / sum(nums) @@ -243,11 +268,15 @@ def lmean(nums): The logarithmic mean of an arbitrarily long series is defined by http://www.survo.fi/papers/logmean.pdf - as: - :math:`L(x_1, x_2, ..., x_n) = - (n-1)! \sum\limits_{i=1}^n \frac{x_i} - {\prod\limits_{\substack{j = 1\\j \ne i}}^n - ln \frac{x_i}{x_j}}` + as + + + .. math:: + + L(x_1, x_2, ..., x_n) = + (n-1)! 
\sum\limits_{i=1}^n \frac{x_i} + {\prod\limits_{\substack{j = 1\\j \ne i}}^n + ln \frac{x_i}{x_j}} Cf. https://en.wikipedia.org/wiki/Logarithmic_mean @@ -263,7 +292,7 @@ def lmean(nums): Raises ------ - AttributeError + ValueError No two values in the nums list may be equal Examples @@ -273,17 +302,27 @@ def lmean(nums): >>> lmean([1, 2]) 1.4426950408889634 + .. versionadded:: 0.1.0 + """ - if len(nums) != len(set(nums)): - raise AttributeError('No two values in the nums list may be equal') - rolling_sum = 0 - for i in range(len(nums)): - rolling_prod = 1 - for j in range(len(nums)): - if i != j: - rolling_prod *= math.log(nums[i] / nums[j]) - rolling_sum += nums[i] / rolling_prod - return math.factorial(len(nums) - 1) * rolling_sum + if len(nums) == 2: + if nums[0] == nums[1]: + return float(nums[0]) + if 0 in nums: + return 0.0 + return (nums[1] - nums[0]) / (math.log(nums[1] / nums[0])) + + else: + if len(nums) != len(set(nums)): + raise ValueError('No two values in the nums list may be equal') + rolling_sum = 0 + for i in range(len(nums)): + rolling_prod = 1 + for j in range(len(nums)): + if i != j: + rolling_prod *= math.log(nums[i] / nums[j]) + rolling_sum += nums[i] / rolling_prod + return math.factorial(len(nums) - 1) * rolling_sum def imean(nums): @@ -291,7 +330,11 @@ def imean(nums): The identric mean of two numbers x and y is: x if x = y - otherwise :math:`\frac{1}{e} \sqrt[x-y]{\frac{x^x}{y^y}}` + otherwise + + .. math:: + + \frac{1}{e} \sqrt[x-y]{\frac{x^x}{y^y}} Cf. https://en.wikipedia.org/wiki/Identric_mean @@ -307,7 +350,7 @@ def imean(nums): Raises ------ - AttributeError + ValueError imean supports no more than two values Examples @@ -319,15 +362,18 @@ def imean(nums): >>> imean([2, 4]) 2.9430355293715387 + .. versionadded:: 0.1.0 + """ if len(nums) == 1: return nums[0] if len(nums) > 2: - raise AttributeError('imean supports no more than two values') + raise ValueError('imean supports no more than two values') if nums[0] <= 0 or nums[1] <= 0: return float('NaN') elif nums[0] == nums[1]: return nums[0] + nums = sorted(nums, reverse=True) return (1 / math.e) * (nums[0] ** nums[0] / nums[1] ** nums[1]) ** ( 1 / (nums[0] - nums[1]) ) @@ -336,8 +382,11 @@ def imean(nums): def seiffert_mean(nums): r"""Return Seiffert's mean. - Seiffert's mean of two numbers x and y is: - :math:`\frac{x - y}{4 \cdot arctan \sqrt{\frac{x}{y}} - \pi}` + Seiffert's mean of two numbers x and y is + + .. math:: + + \frac{x - y}{4 \cdot arctan \sqrt{\frac{x}{y}} - \pi} It is defined in :cite:`Seiffert:1993`. @@ -353,7 +402,7 @@ def seiffert_mean(nums): Raises ------ - AttributeError + ValueError seiffert_mean supports no more than two values Examples @@ -367,11 +416,13 @@ def seiffert_mean(nums): >>> seiffert_mean([2, 1000]) 336.84053300118825 + .. versionadded:: 0.1.0 + """ if len(nums) == 1: return nums[0] if len(nums) > 2: - raise AttributeError('seiffert_mean supports no more than two values') + raise ValueError('seiffert_mean supports no more than two values') if nums[0] + nums[1] == 0 or nums[0] - nums[1] == 0: return float('NaN') return (nums[0] - nums[1]) / ( @@ -382,8 +433,11 @@ def seiffert_mean(nums): def lehmer_mean(nums, exp=2): r"""Return Lehmer mean. - The Lehmer mean is: - :math:`\frac{\sum\limits_i{x_i^p}}{\sum\limits_i{x_i^(p-1)}}` + The Lehmer mean is + + .. math:: + + \frac{\sum\limits_i{x_i^p}}{\sum\limits_i{x_i^(p-1)}} Cf. https://en.wikipedia.org/wiki/Lehmer_mean @@ -408,6 +462,8 @@ def lehmer_mean(nums, exp=2): >>> lehmer_mean([0, 5, 1000]) 995.0497512437811 + .. 
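A short sketch of the revised logarithmic mean: the two-value case is now handled directly, and invalid input raises ValueError instead of AttributeError (the import assumes lmean is re-exported from abydos.stats)::

    from abydos.stats import lmean

    lmean([1, 2])   # 1.4426950408889634  (docstring example)
    lmean([2, 2])   # 2.0 -- an equal pair short-circuits to the value itself

    try:
        lmean([1, 2, 2])
    except ValueError:
        pass  # duplicates in a longer series now raise ValueError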
versionadded:: 0.1.0 + """ return sum(x ** exp for x in nums) / sum(x ** (exp - 1) for x in nums) @@ -416,8 +472,12 @@ def heronian_mean(nums): r"""Return Heronian mean. The Heronian mean is: - :math:`\frac{\sum\limits_{i, j}\sqrt{{x_i \cdot x_j}}} - {|nums| \cdot \frac{|nums| + 1}{2}}` + + .. math:: + + \frac{\sum\limits_{i, j}\sqrt{{x_i \cdot x_j}}} + {|nums| \cdot \frac{|nums| + 1}{2}} + for :math:`j \ge i` Cf. https://en.wikipedia.org/wiki/Heronian_mean @@ -441,6 +501,8 @@ def heronian_mean(nums): >>> heronian_mean([0, 5, 1000]) 179.28511301977582 + .. versionadded:: 0.1.0 + """ mag = len(nums) rolling_sum = 0 @@ -457,7 +519,11 @@ def hoelder_mean(nums, exp=2): r"""Return Hölder (power/generalized) mean. The Hölder mean is defined as: - :math:`\sqrt[p]{\frac{1}{|nums|} \cdot \sum\limits_i{x_i^p}}` + + .. math:: + + \sqrt[p]{\frac{1}{|nums|} \cdot \sum\limits_i{x_i^p}} + for :math:`p \ne 0`, and the geometric mean for :math:`p = 0` Cf. https://en.wikipedia.org/wiki/Generalized_mean @@ -483,17 +549,19 @@ def hoelder_mean(nums, exp=2): >>> hoelder_mean([0, 5, 1000]) 577.3574860228857 + .. versionadded:: 0.1.0 + """ if exp == 0: return gmean(nums) return ((1 / len(nums)) * sum(i ** exp for i in nums)) ** (1 / exp) -def agmean(nums): +def agmean(nums, prec=12): """Return arithmetic-geometric mean. Iterates between arithmetic & geometric means until they converge to - a single value (rounded to 12 digits). + a single value (rounded to 10 digits). Cf. https://en.wikipedia.org/wiki/Arithmetic-geometric_mean @@ -506,6 +574,8 @@ def agmean(nums): ------- float The arithmetic-geometric mean of nums + prec : int + Digits of precision when testing convergeance Examples -------- @@ -516,21 +586,23 @@ def agmean(nums): >>> agmean([0, 5, 1000]) 2.9753977059954195e-13 + .. versionadded:: 0.1.0 + """ m_a = amean(nums) m_g = gmean(nums) if math.isnan(m_a) or math.isnan(m_g): return float('nan') - while round(m_a, 12) != round(m_g, 12): + while round(m_a, prec) != round(m_g, prec): m_a, m_g = (m_a + m_g) / 2, (m_a * m_g) ** (1 / 2) return m_a -def ghmean(nums): +def ghmean(nums, prec=12): """Return geometric-harmonic mean. Iterates between geometric & harmonic means until they converge to - a single value (rounded to 12 digits). + a single value (rounded to 10 digits). Cf. https://en.wikipedia.org/wiki/Geometric-harmonic_mean @@ -538,6 +610,8 @@ def ghmean(nums): ---------- nums : list A series of numbers + prec : int + Digits of precision when testing convergeance Returns ------- @@ -558,27 +632,31 @@ def ghmean(nums): >>> ghmean([0, 0, 5]) nan + .. versionadded:: 0.1.0 + """ m_g = gmean(nums) m_h = hmean(nums) if math.isnan(m_g) or math.isnan(m_h): return float('nan') - while round(m_h, 12) != round(m_g, 12): + while round(m_h, prec) != round(m_g, prec): m_g, m_h = (m_g * m_h) ** (1 / 2), (2 * m_g * m_h) / (m_g + m_h) return m_g -def aghmean(nums): +def aghmean(nums, prec=12): """Return arithmetic-geometric-harmonic mean. Iterates over arithmetic, geometric, & harmonic means until they - converge to a single value (rounded to 12 digits), following the + converge to a single value (rounded to 10 digits), following the method described in :cite:`Raissouli:2009`. Parameters ---------- nums : list A series of numbers + prec : int + Digits of precision when testing convergeance Returns ------- @@ -594,14 +672,16 @@ def aghmean(nums): >>> aghmean([0, 5, 1000]) 335.0 + .. 
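agmean, ghmean, and aghmean now take a prec argument controlling how many digits the iterated means must share before the loop is considered converged. A hedged sketch (import path assumed as above)::

    from abydos.stats import agmean

    agmean([1, 2, 3, 4])          # 2.3545004777751077 with the default prec=12
    agmean([1, 2, 3, 4], prec=4)  # stops earlier; agrees with the default to
                                  # roughly four decimal places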
versionadded:: 0.1.0 + """ m_a = amean(nums) m_g = gmean(nums) m_h = hmean(nums) if math.isnan(m_a) or math.isnan(m_g) or math.isnan(m_h): return float('nan') - while round(m_a, 12) != round(m_g, 12) and round(m_g, 12) != round( - m_h, 12 + while round(m_a, prec) != round(m_g, prec) and round(m_g, prec) != round( + m_h, prec ): m_a, m_g, m_h = ( (m_a + m_g + m_h) / 3, @@ -637,6 +717,8 @@ def midrange(nums): >>> midrange([1, 2, 1000, 3]) 500.5 + .. versionadded:: 0.1.0 + """ return 0.5 * (max(nums) + min(nums)) @@ -669,6 +751,8 @@ def median(nums): >>> median([1, 2, 2, 4]) 2 + .. versionadded:: 0.1.0 + """ nums = sorted(nums) mag = len(nums) @@ -702,6 +786,8 @@ def mode(nums): >>> mode([1, 2, 2, 3]) 2 + .. versionadded:: 0.1.0 + """ return Counter(nums).most_common(1)[0][0] @@ -712,7 +798,9 @@ def var(nums, mean_func=amean, ddof=0): The variance (:math:`\sigma^2`) of a series of numbers (:math:`x_i`) with mean :math:`\mu` and population :math:`N` is: - :math:`\sigma^2 = \frac{1}{N}\sum_{i=1}^{N}(x_i-\mu)^2`. + .. math:: + + \sigma^2 = \frac{1}{N}\sum_{i=1}^{N}(x_i-\mu)^2 Cf. https://en.wikipedia.org/wiki/Variance @@ -739,6 +827,8 @@ def var(nums, mean_func=amean, ddof=0): >>> round(var([1, 2, 3, 4], ddof=1), 12) 1.666666666667 + .. versionadded:: 0.3.0 + """ x_bar = mean_func(nums) return sum((x - x_bar) ** 2 for x in nums) / (len(nums) - ddof) @@ -775,6 +865,8 @@ def std(nums, mean_func=amean, ddof=0): >>> round(std([1, 2, 3, 4], ddof=1), 12) 1.290994448736 + .. versionadded:: 0.3.0 + """ return var(nums, mean_func, ddof) ** 0.5 diff --git a/abydos/stats/_pairwise.py b/abydos/stats/_pairwise.py index d5970a4b9..fabe43948 100644 --- a/abydos/stats/_pairwise.py +++ b/abydos/stats/_pairwise.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -31,7 +31,7 @@ from six.moves import range from ._mean import amean, hmean, std -from ..distance import sim +from ..distance._levenshtein import sim_levenshtein as sim __all__ = ['mean_pairwise_similarity', 'pairwise_similarity_statistics'] @@ -81,6 +81,8 @@ def mean_pairwise_similarity( >>> round(mean_pairwise_similarity(['Niall', 'Neal', 'Neil']), 12) 0.545454545455 + .. versionadded:: 0.1.0 + """ if not callable(mean_func): raise ValueError('mean_func must be a function') @@ -157,6 +159,8 @@ def pairwise_similarity_statistics( ... ['Christopher', 'Kristof', 'Christobal'], ['Niall', 'Neal', 'Neil'])) (0.2, 0.0, 0.118614718615, 0.075070477184) + .. versionadded:: 0.3.0 + """ if not callable(mean_func): raise ValueError('mean_func must be a function') diff --git a/abydos/stemmer/__init__.py b/abydos/stemmer/__init__.py index 955ffa4cb..af6c3fef7 100644 --- a/abydos/stemmer/__init__.py +++ b/abydos/stemmer/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. 
# # Abydos is free software: you can redistribute it and/or modify @@ -89,14 +89,18 @@ from ._porter2 import Porter2, porter2 from ._s_stemmer import SStemmer, s_stemmer from ._schinke import Schinke, schinke +from ._snowball import _Snowball from ._snowball_danish import SnowballDanish, sb_danish from ._snowball_dutch import SnowballDutch, sb_dutch from ._snowball_german import SnowballGerman, sb_german from ._snowball_norwegian import SnowballNorwegian, sb_norwegian from ._snowball_swedish import SnowballSwedish, sb_swedish +from ._stemmer import _Stemmer from ._uea_lite import UEALite, uealite __all__ = [ + '_Stemmer', + '_Snowball', 'Lovins', 'lovins', 'PaiceHusk', diff --git a/abydos/stemmer/_caumanns.py b/abydos/stemmer/_caumanns.py index 16db7c79b..18304d35f 100644 --- a/abydos/stemmer/_caumanns.py +++ b/abydos/stemmer/_caumanns.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['Caumanns', 'caumanns'] @@ -46,6 +49,8 @@ class Caumanns(_Stemmer): This implementation is based on the GermanStemFilter described at :cite:`Lang:2013`. + + .. versionadded:: 0.3.6 """ _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou')) @@ -73,6 +78,11 @@ def stem(self, word): >>> stmr.stem('buchstabieren') 'buchstabier' + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if not word: return '' @@ -146,6 +156,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Caumanns.stem method instead.', +) def caumanns(word): """Return Caumanns German stem. @@ -170,6 +186,8 @@ def caumanns(word): >>> caumanns('buchstabieren') 'buchstabier' + .. versionadded:: 0.2.0 + """ return Caumanns().stem(word) diff --git a/abydos/stemmer/_clef_german.py b/abydos/stemmer/_clef_german.py index 60b117703..fae875553 100644 --- a/abydos/stemmer/_clef_german.py +++ b/abydos/stemmer/_clef_german.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['CLEFGerman', 'clef_german'] @@ -41,6 +44,8 @@ class CLEFGerman(_Stemmer): """CLEF German stemmer. The CLEF German stemmer is defined at :cite:`Savoy:2005`. + + .. versionadded:: 0.3.6 """ _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou')) @@ -68,6 +73,11 @@ def stem(self, word): >>> stmr.stem('buchstabieren') 'buchstabier' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -90,6 +100,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the CLEFGerman.stem method instead.', +) def clef_german(word): """Return CLEF German stem. @@ -114,6 +130,8 @@ def clef_german(word): >>> clef_german('buchstabieren') 'buchstabier' + .. versionadded:: 0.1.0 + """ return CLEFGerman().stem(word) diff --git a/abydos/stemmer/_clef_german_plus.py b/abydos/stemmer/_clef_german_plus.py index 001d48233..3e379118a 100644 --- a/abydos/stemmer/_clef_german_plus.py +++ b/abydos/stemmer/_clef_german_plus.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from ._stemmer import _Stemmer +from .. 
import __version__ __all__ = ['CLEFGermanPlus', 'clef_german_plus'] @@ -41,6 +44,8 @@ class CLEFGermanPlus(_Stemmer): """CLEF German stemmer plus. The CLEF German stemmer plus is defined at :cite:`Savoy:2005`. + + .. versionadded:: 0.3.6 """ _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'} @@ -72,6 +77,11 @@ def stem(self, word): >>> clef_german_plus('buchstabieren') 'buchstabi' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -104,6 +114,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the CLEFGermanPlus.stem method instead.', +) def clef_german_plus(word): """Return 'CLEF German stemmer plus' stem. @@ -129,6 +145,8 @@ def clef_german_plus(word): >>> clef_german_plus('buchstabieren') 'buchstabi' + .. versionadded:: 0.1.0 + """ return CLEFGermanPlus().stem(word) diff --git a/abydos/stemmer/_clef_swedish.py b/abydos/stemmer/_clef_swedish.py index 4976bfd28..f80ece3ad 100644 --- a/abydos/stemmer/_clef_swedish.py +++ b/abydos/stemmer/_clef_swedish.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['CLEFSwedish', 'clef_swedish'] @@ -37,6 +40,8 @@ class CLEFSwedish(_Stemmer): """CLEF Swedish stemmer. The CLEF Swedish stemmer is defined at :cite:`Savoy:2005`. + + .. versionadded:: 0.3.6 """ def stem(self, word): @@ -61,6 +66,11 @@ def stem(self, word): >>> clef_swedish('visshet') 'viss' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ wlen = len(word) - 2 @@ -82,6 +92,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the CLEFSwedish.stem method instead.', +) def clef_swedish(word): """Return CLEF Swedish stem. @@ -106,6 +122,8 @@ def clef_swedish(word): >>> clef_swedish('visshet') 'viss' + .. versionadded:: 0.1.0 + """ return CLEFSwedish().stem(word) diff --git a/abydos/stemmer/_lovins.py b/abydos/stemmer/_lovins.py index bddf80a50..f43e69f39 100644 --- a/abydos/stemmer/_lovins.py +++ b/abydos/stemmer/_lovins.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['Lovins', 'lovins'] @@ -43,6 +46,8 @@ class Lovins(_Stemmer): The Lovins stemmer is described in Julie Beth Lovins's article :cite:`Lovins:1968`. + + .. versionadded:: 0.3.6 """ def _cond_b(self, word, suffix_len): @@ -60,6 +65,11 @@ def _cond_b(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 3 @@ -78,6 +88,11 @@ def _cond_c(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 4 @@ -96,6 +111,11 @@ def _cond_d(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 5 @@ -114,6 +134,11 @@ def _cond_e(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. 
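The stemmer modules follow the same deprecation pattern as the phonetic encoders: the functional wrappers survive until 0.6.0 but now warn and delegate to the classes. A sketch, assuming the usual abydos.stemmer re-exports and using the docstring example::

    from abydos.stemmer import CLEFSwedish, clef_swedish

    CLEFSwedish().stem('visshet')   # 'viss'
    clef_swedish('visshet')         # 'viss', now with a deprecation warning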
versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] != 'e' @@ -132,6 +157,11 @@ def _cond_f(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] != 'e' @@ -150,6 +180,11 @@ def _cond_g(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] == 'f' @@ -168,6 +203,11 @@ def _cond_h(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return ( word[-suffix_len - 1] == 't' @@ -189,6 +229,11 @@ def _cond_i(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] not in {'e', 'o'} @@ -207,6 +252,11 @@ def _cond_j(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] not in {'a', 'e'} @@ -225,6 +275,11 @@ def _cond_k(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return (len(word) - suffix_len >= 3) and ( word[-suffix_len - 1] in {'i', 'l'} @@ -246,6 +301,11 @@ def _cond_l(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return ( word[-suffix_len - 1] not in {'s', 'u', 'x'} @@ -267,6 +327,11 @@ def _cond_m(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] not in {'a', 'c', 'e', 'm'} @@ -285,6 +350,11 @@ def _cond_n(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if len(word) - suffix_len >= 3: if word[-suffix_len - 3] == 's': @@ -309,6 +379,11 @@ def _cond_o(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] in {'i', 'l'} @@ -327,6 +402,11 @@ def _cond_p(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] != 'c' @@ -345,6 +425,11 @@ def _cond_q(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] not in { 'l', @@ -366,6 +451,11 @@ def _cond_r(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] in {'n', 'r'} @@ -384,6 +474,11 @@ def _cond_s(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 2 : -suffix_len] == 'dr' or ( word[-suffix_len - 1] == 't' @@ -405,6 +500,11 @@ def _cond_t(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ return ( word[-suffix_len - 1] in {'s', 't'} @@ -426,6 +526,11 @@ def _cond_u(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] in {'l', 'm', 'n', 'r'} @@ -444,6 +549,11 @@ def _cond_v(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] == 'c' @@ -462,6 +572,11 @@ def _cond_w(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] not in {'s', 'u'} @@ -480,6 +595,11 @@ def _cond_x(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] in {'i', 'l'} or ( word[-suffix_len - 3 : -suffix_len] == 'u' @@ -501,6 +621,11 @@ def _cond_y(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 2 : -suffix_len] == 'in' @@ -519,6 +644,11 @@ def _cond_z(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] != 'f' @@ -537,6 +667,11 @@ def _cond_aa(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] in {'d', 'f', 'l', 't'} or word[ -suffix_len - 2 : -suffix_len @@ -557,6 +692,11 @@ def _cond_bb(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return ( len(word) - suffix_len >= 3 @@ -579,6 +719,11 @@ def _cond_cc(self, word, suffix_len): bool True if condition is met + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return word[-suffix_len - 1] == 'l' @@ -595,6 +740,11 @@ def _recode9(self, stem): str Word stripped of suffix + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if stem[-3:-2] in {'a', 'i', 'o'}: return stem @@ -613,6 +763,11 @@ def _recode24(self, stem): str Word stripped of suffix + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if stem[-4:-3] == 's': return stem @@ -631,6 +786,11 @@ def _recode28(self, stem): str Word stripped of suffix + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if stem[-4:-3] in {'p', 't'}: return stem @@ -649,6 +809,11 @@ def _recode30(self, stem): str Word stripped of suffix + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if stem[-4:-3] == 'm': return stem @@ -667,6 +832,11 @@ def _recode32(self, stem): str Word stripped of suffix + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if stem[-3:-2] == 'n': return stem @@ -676,7 +846,11 @@ def _recode32(self, stem): _recode = () def __init__(self): - """Initialize the stemmer.""" + """Initialize the stemmer. + + .. 
versionadded:: 0.3.6 + + """ self._suffix = { 'alistically': self._cond_b, 'arizability': None, @@ -964,8 +1138,8 @@ def __init__(self): 'um': self._cond_u, 'us': self._cond_v, 'yl': self._cond_r, - '\'s': None, - 's\'': None, + "'s": None, + "s'": None, 'a': None, 'e': None, 'i': None, @@ -1034,6 +1208,11 @@ def stem(self, word): >>> stmr.stem('elusiveness') 'elus' + + .. versionadded:: 0.2.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -1075,6 +1254,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Lovins.stem method instead.', +) def lovins(word): """Return Lovins stem. @@ -1098,6 +1283,8 @@ def lovins(word): >>> lovins('elusiveness') 'elus' + .. versionadded:: 0.2.0 + """ return Lovins().stem(word) diff --git a/abydos/stemmer/_paice_husk.py b/abydos/stemmer/_paice_husk.py index 4d7e2377b..3ccd623cd 100644 --- a/abydos/stemmer/_paice_husk.py +++ b/abydos/stemmer/_paice_husk.py @@ -28,9 +28,12 @@ unicode_literals, ) +from deprecation import deprecated + from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['PaiceHusk', 'paice_husk'] @@ -42,6 +45,8 @@ class PaiceHusk(_Stemmer): Stemmer, developed by Chris Paice, with the assistance of Gareth Husk This is based on the algorithm's description in :cite:`Paice:1990`. + + .. versionadded:: 0.3.6 """ _rule_table = { @@ -225,6 +230,11 @@ def stem(self, word): >>> stmr.stem('torment') 'tor' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ terminate = False intact = True @@ -256,6 +266,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the PaiceHusk.stem method instead.', +) def paice_husk(word): """Return Paice-Husk stem. @@ -284,6 +300,8 @@ def paice_husk(word): >>> paice_husk('torment') 'tor' + .. versionadded:: 0.3.0 + """ return PaiceHusk().stem(word) diff --git a/abydos/stemmer/_porter.py b/abydos/stemmer/_porter.py index d6aa77f5b..ce73fdd0f 100644 --- a/abydos/stemmer/_porter.py +++ b/abydos/stemmer/_porter.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['Porter', 'porter'] @@ -42,6 +45,8 @@ class Porter(_Stemmer): """Porter stemmer. The Porter stemmer is described in :cite:`Porter:1980`. + + .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y'} @@ -61,6 +66,11 @@ def _m_degree(self, term): int The m-degree as defined in the Porter stemmer definition + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ mdeg = 0 last_was_vowel = False @@ -87,6 +97,11 @@ def _has_vowel(self, term): True iff a vowel exists in the term (as defined in the Porter stemmer definition) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ for letter in term: if letter in self._vowels: @@ -107,6 +122,11 @@ def _ends_in_doubled_cons(self, term): True iff the stem ends in a doubled consonant (as defined in the Porter stemmer definition) + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ return ( len(term) > 1 @@ -128,6 +148,11 @@ def _ends_in_cvc(self, term): True iff the stem ends in cvc (as defined in the Porter stemmer definition) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ return len(term) > 2 and ( term[-1] not in self._vowels @@ -136,17 +161,29 @@ def _ends_in_cvc(self, term): and term[-1] not in tuple('wxY') ) - def stem(self, word, early_english=False): - """Return Porter stem. + def __init__(self, early_english=False): + """Initialize Porter instance. Parameters ---------- - word : str - The word to stem early_english : bool Set to True in order to remove -eth & -est (2nd & 3rd person singular verbal agreement suffixes) + + .. versionadded:: 0.4.0 + + """ + self._early_english = early_english + + def stem(self, word): + """Return Porter stem. + + Parameters + ---------- + word : str + The word to stem + Returns ------- str @@ -162,9 +199,15 @@ def stem(self, word, early_english=False): >>> stmr.stem('elusiveness') 'elus' - >>> stmr.stem('eateth', early_english=True) + >>> stmr = Porter(early_english=True) + >>> stmr.stem('eateth') 'eat' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -204,7 +247,7 @@ def stem(self, word, early_english=False): if self._has_vowel(word[:-3]): word = word[:-3] step1b_flag = True - elif early_english: + elif self._early_english: if word[-3:] == 'est': if self._has_vowel(word[:-3]): word = word[:-3] @@ -371,6 +414,12 @@ def stem(self, word, early_english=False): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Porter.stem method instead.', +) def porter(word, early_english=False): """Return Porter stem. @@ -401,8 +450,10 @@ def porter(word, early_english=False): >>> porter('eateth', early_english=True) 'eat' + .. versionadded:: 0.1.0 + """ - return Porter().stem(word, early_english) + return Porter(early_english).stem(word) if __name__ == '__main__': diff --git a/abydos/stemmer/_porter2.py b/abydos/stemmer/_porter2.py index 2349a2ff6..b40b4eb9e 100644 --- a/abydos/stemmer/_porter2.py +++ b/abydos/stemmer/_porter2.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._snowball import _Snowball +from .. import __version__ __all__ = ['Porter2', 'porter2'] @@ -42,6 +45,8 @@ class Porter2(_Snowball): """Porter2 (Snowball English) stemmer. The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`. + + .. versionadded:: 0.3.6 """ _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'} @@ -84,17 +89,29 @@ class Porter2(_Snowball): 'succeed', } - def stem(self, word, early_english=False): - """Return the Porter2 (Snowball English) stem. + def __init__(self, early_english=False): + """Initialize Porter2 instance. Parameters ---------- - word : str - The word to stem early_english : bool Set to True in order to remove -eth & -est (2nd & 3rd person singular verbal agreement suffixes) + + .. versionadded:: 0.4.0 + + """ + self._early_english = early_english + + def stem(self, word): + """Return the Porter2 (Snowball English) stem. 
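
For illustration, a minimal sketch of the constructor-based API introduced in the Porter and Porter2 hunks, reusing only inputs and outputs that already appear in their doctests (the old functional wrappers remain available but are deprecated in 0.4.0, for removal in 0.6.0):

>>> from abydos.stemmer import Porter, porter
>>> # New API: early_english is now set on the constructor.
>>> Porter(early_english=True).stem('eateth')
'eat'
>>> # Old functional API: still works, but is deprecated as of 0.4.0.
>>> porter('eateth', early_english=True)
'eat'
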
+ + Parameters + ---------- + word : str + The word to stem + Returns ------- str @@ -110,16 +127,22 @@ def stem(self, word, early_english=False): >>> stmr.stem('elusiveness') 'elus' - >>> stmr.stem('eateth', early_english=True) + >>> stmr = Porter2(early_english=True) + >>> stmr.stem('eateth') 'eat' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) # replace apostrophe-like characters with U+0027, per # http://snowball.tartarus.org/texts/apostrophe.html - word = word.replace('’', '\'') - word = word.replace('’', '\'') + word = word.replace('’', "'") + word = word.replace('’', "'") # Exceptions 1 if word in self._exception1dict: @@ -132,7 +155,7 @@ def stem(self, word, early_english=False): return word # Remove initial ', if present. - while word and word[0] == '\'': + while word and word[0] == "'": word = word[1:] # Return word if stem is shorter than 2 if len(word) < 2: @@ -149,11 +172,11 @@ def stem(self, word, early_english=False): r2_start = self._sb_r2(word, self._r1_prefixes) # Step 0 - if word[-3:] == '\'s\'': + if word[-3:] == "'s'": word = word[:-3] - elif word[-2:] == '\'s': + elif word[-2:] == "'s": word = word[:-2] - elif word[-1:] == '\'': + elif word[-1:] == "'": word = word[:-1] # Return word if stem is shorter than 2 if len(word) < 3: @@ -201,7 +224,7 @@ def stem(self, word, early_english=False): if self._sb_has_vowel(word[:-2]): word = word[:-2] step1b_flag = True - elif early_english: + elif self._early_english: if word[-3:] == 'est': if self._sb_has_vowel(word[:-3]): word = word[:-3] @@ -377,6 +400,12 @@ def stem(self, word, early_english=False): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Porter2.stem method instead.', +) def porter2(word, early_english=False): """Return the Porter2 (Snowball English) stem. @@ -407,8 +436,10 @@ def porter2(word, early_english=False): >>> porter2('eateth', early_english=True) 'eat' + .. versionadded:: 0.1.0 + """ - return Porter2().stem(word, early_english) + return Porter2(early_english).stem(word) if __name__ == '__main__': diff --git a/abydos/stemmer/_s_stemmer.py b/abydos/stemmer/_s_stemmer.py index 97d49d8d6..0b2ed25ad 100644 --- a/abydos/stemmer/_s_stemmer.py +++ b/abydos/stemmer/_s_stemmer.py @@ -28,7 +28,10 @@ unicode_literals, ) +from deprecation import deprecated + from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['SStemmer', 's_stemmer'] @@ -37,6 +40,8 @@ class SStemmer(_Stemmer): """S-stemmer. The S stemmer is defined in :cite:`Harman:1991`. + + .. versionadded:: 0.3.6 """ def stem(self, word): @@ -66,6 +71,11 @@ def stem(self, word): >>> stmr.stem('census') 'census' + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ lowered = word.lower() if lowered[-3:] == 'ies' and lowered[-4:-3] not in {'e', 'a'}: @@ -77,6 +87,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SStemmer.stem method instead.', +) def s_stemmer(word): """Return the S-stemmed form of a word. @@ -105,6 +121,8 @@ def s_stemmer(word): >>> s_stemmer('census') 'census' + .. 
versionadded:: 0.3.0 + """ return SStemmer().stem(word) diff --git a/abydos/stemmer/_schinke.py b/abydos/stemmer/_schinke.py index 002e4f6e7..a187d1b84 100644 --- a/abydos/stemmer/_schinke.py +++ b/abydos/stemmer/_schinke.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['Schinke', 'schinke'] @@ -42,6 +45,8 @@ class Schinke(_Stemmer): """Schinke stemmer. This is defined in :cite:`Schinke:1996`. + + .. versionadded:: 0.3.6 """ _keep_que = { @@ -165,6 +170,11 @@ def stem(self, word): >>> stmr.stem('senatus') {'n': 'senat', 'v': 'senatu'} + + .. versionadded:: 0.3.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ word = normalize('NFKD', text_type(word.lower())) word = ''.join( @@ -261,6 +271,12 @@ def stem(self, word): return {'n': noun, 'v': verb} +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the Schinke.stem method instead.', +) def schinke(word): """Return the stem of a word according to the Schinke stemmer. @@ -289,6 +305,8 @@ def schinke(word): >>> schinke('senatus') {'n': 'senat', 'v': 'senatu'} + .. versionadded:: 0.3.0 + """ return Schinke().stem(word) diff --git a/abydos/stemmer/_snowball.py b/abydos/stemmer/_snowball.py index bbeced01c..b9c6d60e8 100644 --- a/abydos/stemmer/_snowball.py +++ b/abydos/stemmer/_snowball.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -32,12 +32,17 @@ from ._stemmer import _Stemmer +__all__ = ['_Snowball'] + class _Snowball(_Stemmer): - """Snowball stemmer base class.""" + """Snowball stemmer base class. + + .. versionadded:: 0.3.6 + """ _vowels = set('aeiouy') - _codanonvowels = set('\'bcdfghjklmnpqrstvz') + _codanonvowels = set("'bcdfghjklmnpqrstvz") def _sb_r1(self, term, r1_prefixes=None): """Return the R1 region, as defined in the Porter2 specification. @@ -54,6 +59,11 @@ def _sb_r1(self, term, r1_prefixes=None): int Length of the R1 region + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ vowel_found = False if hasattr(r1_prefixes, '__iter__'): @@ -83,6 +93,11 @@ def _sb_r2(self, term, r1_prefixes=None): int Length of the R1 region + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ r1_start = self._sb_r1(term, r1_prefixes) return r1_start + self._sb_r1(term[r1_start:]) @@ -105,6 +120,11 @@ def _sb_ends_in_short_syllable(self, term): bool True iff term ends in a short syllable + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if not term: return False @@ -137,6 +157,11 @@ def _sb_short_word(self, term, r1_prefixes=None): bool True iff term is a short word + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if self._sb_r1(term, r1_prefixes) == len( term @@ -158,6 +183,11 @@ def _sb_has_vowel(self, term): True iff a vowel exists in the term (as defined in the Porter stemmer definition) + + .. versionadded:: 0.1.0 + .. 
versionchanged:: 0.3.6 + Encapsulated in class + """ for letter in term: if letter in self._vowels: diff --git a/abydos/stemmer/_snowball_danish.py b/abydos/stemmer/_snowball_danish.py index bcb01ee36..7757bd584 100644 --- a/abydos/stemmer/_snowball_danish.py +++ b/abydos/stemmer/_snowball_danish.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from ._snowball import _Snowball +from .. import __version__ __all__ = ['SnowballDanish', 'sb_danish'] @@ -42,6 +45,8 @@ class SnowballDanish(_Snowball): The Snowball Danish stemmer is defined at: http://snowball.tartarus.org/algorithms/danish/stemmer.html + + .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'} @@ -91,6 +96,11 @@ def stem(self, word): >>> stmr.stem('sikkerhed') 'sikker' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -174,6 +184,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SnowballDanish.stem method instead.', +) def sb_danish(word): """Return Snowball Danish stem. @@ -198,6 +214,8 @@ def sb_danish(word): >>> sb_danish('sikkerhed') 'sikker' + .. versionadded:: 0.1.0 + """ return SnowballDanish().stem(word) diff --git a/abydos/stemmer/_snowball_dutch.py b/abydos/stemmer/_snowball_dutch.py index b724ba324..3da450856 100644 --- a/abydos/stemmer/_snowball_dutch.py +++ b/abydos/stemmer/_snowball_dutch.py @@ -30,10 +30,13 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from six.moves import range from ._snowball import _Snowball +from .. import __version__ __all__ = ['SnowballDutch', 'sb_dutch'] @@ -43,6 +46,8 @@ class SnowballDutch(_Snowball): The Snowball Dutch stemmer is defined at: http://snowball.tartarus.org/algorithms/dutch/stemmer.html + + .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'} @@ -62,6 +67,11 @@ def _undouble(self, word): str The word with doubled endings undoubled + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ if ( len(word) > 1 @@ -94,6 +104,11 @@ def stem(self, word): >>> stmr.stem('ongrijpbaarheid') 'ongrijp' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, decompose, filter umlauts & acutes out, and # compose @@ -213,6 +228,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SnowballDutch.stem method instead.', +) def sb_dutch(word): """Return Snowball Dutch stem. @@ -237,6 +258,8 @@ def sb_dutch(word): >>> sb_dutch('ongrijpbaarheid') 'ongrijp' + .. versionadded:: 0.1.0 + """ return SnowballDutch().stem(word) diff --git a/abydos/stemmer/_snowball_german.py b/abydos/stemmer/_snowball_german.py index 83cac17ba..0378bbdbd 100644 --- a/abydos/stemmer/_snowball_german.py +++ b/abydos/stemmer/_snowball_german.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six.moves import range from ._snowball import _Snowball +from .. import __version__ __all__ = ['SnowballGerman', 'sb_german'] @@ -42,21 +45,35 @@ class SnowballGerman(_Snowball): The Snowball German stemmer is defined at: http://snowball.tartarus.org/algorithms/german/stemmer.html + + .. 
versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'} _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'} _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'} - def stem(self, word, alternate_vowels=False): + def __init__(self, alternate_vowels=False): + """Initialize SnowballGerman instance. + + Parameters + ---------- + alternate_vowels : bool + Composes ae as ä, oe as ö, and ue as ü before running the algorithm + + + .. versionadded:: 0.4.0 + + """ + self._alternate_vowels = alternate_vowels + + def stem(self, word): """Return Snowball German stem. Parameters ---------- word : str The word to stem - alternate_vowels : bool - Composes ae as ä, oe as ö, and ue as ü before running the algorithm Returns ------- @@ -73,6 +90,11 @@ def stem(self, word, alternate_vowels=False): >>> stmr.stem('buchstabieren') 'buchstabi' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', word.lower()) @@ -86,7 +108,7 @@ def stem(self, word, alternate_vowels=False): elif word[i - 1] == 'y': word = word[: i - 1] + 'Y' + word[i:] - if alternate_vowels: + if self._alternate_vowels: word = word.replace('ae', 'ä') word = word.replace('oe', 'ö') word = word.replace('que', 'Q') @@ -191,6 +213,12 @@ def stem(self, word, alternate_vowels=False): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SnowballGerman.stem method instead.', +) def sb_german(word, alternate_vowels=False): """Return Snowball German stem. @@ -217,8 +245,10 @@ def sb_german(word, alternate_vowels=False): >>> sb_german('buchstabieren') 'buchstabi' + .. versionadded:: 0.1.0 + """ - return SnowballGerman().stem(word, alternate_vowels) + return SnowballGerman(alternate_vowels).stem(word) if __name__ == '__main__': diff --git a/abydos/stemmer/_snowball_norwegian.py b/abydos/stemmer/_snowball_norwegian.py index ef3a1a1ac..9840e4b34 100644 --- a/abydos/stemmer/_snowball_norwegian.py +++ b/abydos/stemmer/_snowball_norwegian.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from ._snowball import _Snowball +from .. import __version__ __all__ = ['SnowballNorwegian', 'sb_norwegian'] @@ -42,6 +45,8 @@ class SnowballNorwegian(_Snowball): The Snowball Norwegian stemmer is defined at: http://snowball.tartarus.org/algorithms/norwegian/stemmer.html + + .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'} @@ -88,6 +93,11 @@ def stem(self, word): >>> stmr.stem('sikkerhet') 'sikker' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -152,6 +162,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SnowballNorwegian.stem method instead.', +) def sb_norwegian(word): """Return Snowball Norwegian stem. @@ -176,6 +192,8 @@ def sb_norwegian(word): >>> sb_norwegian('sikkerhet') 'sikker' + .. 
versionadded:: 0.1.0 + """ return SnowballNorwegian().stem(word) diff --git a/abydos/stemmer/_snowball_swedish.py b/abydos/stemmer/_snowball_swedish.py index 2f4f0d31e..41da1bf53 100644 --- a/abydos/stemmer/_snowball_swedish.py +++ b/abydos/stemmer/_snowball_swedish.py @@ -30,9 +30,12 @@ from unicodedata import normalize +from deprecation import deprecated + from six import text_type from ._snowball import _Snowball +from .. import __version__ __all__ = ['SnowballSwedish', 'sb_swedish'] @@ -42,6 +45,8 @@ class SnowballSwedish(_Snowball): The Snowball Swedish stemmer is defined at: http://snowball.tartarus.org/algorithms/swedish/stemmer.html + + .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'} @@ -88,6 +93,11 @@ def stem(self, word): >>> stmr.stem('visshet') 'viss' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) @@ -152,6 +162,12 @@ def stem(self, word): return word +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the SnowballSwedish.stem method instead.', +) def sb_swedish(word): """Return Snowball Swedish stem. @@ -176,6 +192,8 @@ def sb_swedish(word): >>> sb_swedish('visshet') 'viss' + .. versionadded:: 0.1.0 + """ return SnowballSwedish().stem(word) diff --git a/abydos/stemmer/_stemmer.py b/abydos/stemmer/_stemmer.py index 3ce255516..b052fb265 100644 --- a/abydos/stemmer/_stemmer.py +++ b/abydos/stemmer/_stemmer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -28,11 +28,16 @@ unicode_literals, ) +__all__ = ['_Stemmer'] + class _Stemmer(object): - """Abstract Stemmer class.""" + """Abstract Stemmer class. + + .. versionadded:: 0.3.6 + """ - def stem(self, word, *args, **kwargs): + def stem(self, word): """Return stem. Parameters @@ -49,6 +54,9 @@ def stem(self, word, *args, **kwargs): str Word stem + + .. versionadded:: 0.3.6 + """ pass diff --git a/abydos/stemmer/_uea_lite.py b/abydos/stemmer/_uea_lite.py index 5195fe3d5..c481a1abb 100644 --- a/abydos/stemmer/_uea_lite.py +++ b/abydos/stemmer/_uea_lite.py @@ -30,9 +30,12 @@ from re import match as re_match +from deprecation import deprecated + from six.moves import range from ._stemmer import _Stemmer +from .. import __version__ __all__ = ['UEALite', 'uealite'] @@ -48,6 +51,8 @@ class UEALite(_Stemmer): Java version: :cite:`Churchill:2005` Perl version: :cite:`Jenkins:2005` Ruby version: :cite:`Adams:2017` + + .. versionadded:: 0.3.6 """ _problem_words = {'is', 'as', 'this', 'has', 'was', 'during'} @@ -617,20 +622,17 @@ class UEALite(_Stemmer): 'Perl': _perl_rule_table, } - def stem( + def __init__( self, - word, max_word_length=20, max_acro_length=8, return_rule_no=False, var='standard', ): - """Return UEA-Lite stem. + """Initialize UEALite instance. Parameters ---------- - word : str - The word to stem max_word_length : int The maximum word length allowed max_acro_length : int @@ -643,6 +645,23 @@ def stem( - ``Adams`` to use Jason Adams' rules - ``Perl`` to use the original Perl rules + + .. versionadded:: 0.4.0 + + """ + self._max_word_length = max_word_length + self._max_acro_length = max_acro_length + self._return_rule_no = return_rule_no + self._var = var + + def stem(self, word): + """Return UEA-Lite stem. 
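
As a sketch of the same migration for UEA-Lite, the tuning options (max_word_length, max_acro_length, return_rule_no, var) are now fixed on the instance rather than passed to every stem() call; the inputs and outputs below are taken from the doctests in this hunk:

>>> from abydos.stemmer import UEALite
>>> stmr = UEALite()          # defaults match the old uealite() wrapper
>>> stmr.stem('readings')
'read'
>>> stmr.stem('insulted')
'insult'

Constructing with return_rule_no=True makes stem() return a (stem, rule number) tuple instead of a bare string, and var='Adams' or var='Perl' selects the alternate rule tables.
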
+ + Parameters + ---------- + word : str + The word to stem + Returns ------- str or (str, int) @@ -661,6 +680,11 @@ def stem( >>> uealite('eroded') 'erode' + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.3.6 + Encapsulated in class + """ def _stem_with_duplicate_character_check(word, del_len): @@ -678,10 +702,10 @@ def _stem(word): if not word: return word, 0 if word in self._problem_words or ( - word == 'menses' and var == 'Adams' + word == 'menses' and self._var == 'Adams' ): return word, 90 - if max_word_length and len(word) > max_word_length: + if self._max_word_length and len(word) > self._max_word_length: return word, 95 if "'" in word: @@ -710,25 +734,33 @@ def _stem(word): elif '_' in word: return word, 90 elif word[-1] == 's' and word[:-1].isupper(): - if var == 'Adams' and len(word) - 1 > max_acro_length: + if ( + self._var == 'Adams' + and len(word) - 1 > self._max_acro_length + ): return word, 96 return word[:-1], 91.1 elif word.isupper(): - if var == 'Adams' and len(word) > max_acro_length: + if ( + self._var == 'Adams' + and len(word) > self._max_acro_length + ): return word, 96 return word, 91 elif re_match(r'^.*[A-Z].*[A-Z].*$', word): return word, 92 elif word[0].isupper(): return word, 93 - elif var == 'Adams' and re_match( + elif self._var == 'Adams' and re_match( r'^[a-z](|[rl])(ing|ed)$', word ): return word, 97 for n in range(7, 1, -1): - if word[-n:] in self._rules[var][n]: - rule_no, del_len, add_str = self._rules[var][n][word[-n:]] + if word[-n:] in self._rules[self._var][n]: + rule_no, del_len, add_str = self._rules[self._var][n][ + word[-n:] + ] if del_len: stemmed_word = word[:-del_len] else: @@ -755,11 +787,17 @@ def _stem(word): return stemmed_word, rule_no stem, rule_no = _stem(word) - if return_rule_no: + if self._return_rule_no: return stem, rule_no return stem +@deprecated( + deprecated_in='0.4.0', + removed_in='0.6.0', + current_version=__version__, + details='Use the UEALite.stem method instead.', +) def uealite( word, max_word_length=20, @@ -805,9 +843,11 @@ def uealite( >>> uealite('eroded') 'erode' + .. versionadded:: 0.1.0 + """ - return UEALite().stem( - word, max_word_length, max_acro_length, return_rule_no, var + return UEALite(max_word_length, max_acro_length, return_rule_no, var).stem( + word ) diff --git a/abydos/tokenizer/__init__.py b/abydos/tokenizer/__init__.py index 5fb790c32..32f984d21 100644 --- a/abydos/tokenizer/__init__.py +++ b/abydos/tokenizer/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -16,32 +16,78 @@ # You should have received a copy of the GNU General Public License # along with Abydos. If not, see . -"""abydos.tokenizer. +r"""abydos.tokenizer. The tokenizer package collects classes whose purpose is to tokenize -text. Currently, this is limited to the :py:class:`.QGrams` class, which -tokenizes a string into q-grams. The class supports different values of -q, the addition of start and stop symbols, and skip values. It even supports -multiple values for q and skip, using lists or ranges. 
-
->>> QGrams('interning', qval=2, start_stop='$#')
-QGrams({'in': 2, '$i': 1, 'nt': 1, 'te': 1, 'er': 1, 'rn': 1, 'ni': 1, 'ng': 1,
- 'g#': 1})
-
->>> QGrams('AACTAGAAC', start_stop='', skip=1)
-QGrams({'AC': 2, 'AT': 1, 'CA': 1, 'TG': 1, 'AA': 1, 'GA': 1, 'A': 1})
-
->>> QGrams('AACTAGAAC', start_stop='', skip=[0, 1])
-QGrams({'AC': 4, 'AA': 3, 'GA': 2, 'CT': 1, 'TA': 1, 'AG': 1, 'AT': 1, 'CA': 1,
- 'TG': 1, 'A': 1})
-
->>> QGrams('interdisciplinarian', qval=range(3), skip=[0, 1])
-QGrams({'i': 10, 'n': 7, 'r': 4, 'a': 4, 'in': 3, 't': 2, 'e': 2, 'd': 2,
- 's': 2, 'c': 2, 'p': 2, 'l': 2, 'ri': 2, 'ia': 2, '$i': 1, 'nt': 1, 'te': 1,
- 'er': 1, 'rd': 1, 'di': 1, 'is': 1, 'sc': 1, 'ci': 1, 'ip': 1, 'pl': 1,
- 'li': 1, 'na': 1, 'ar': 1, 'an': 1, 'n#': 1, '$n': 1, 'it': 1, 'ne': 1,
- 'tr': 1, 'ed': 1, 'ds': 1, 'ic': 1, 'si': 1, 'cp': 1, 'il': 1, 'pi': 1,
- 'ln': 1, 'nr': 1, 'ai': 1, 'ra': 1, 'a#': 1})
+text or individual words. Each tokenizer also accepts a scaler argument at
+construction, which controls how token counts are scaled. The scaler defaults
+to None, which performs no scaling. Setting scaler to 'set' converts token
+counters from multisets to sets, so multiple instances of a token are counted
+only once. Alternatively, a callable function of one argument (such as log,
+exp, or lambda x: x + 1) may be passed as scaler, and this function will be
+applied to each count value.
+
+The following general tokenizers are provided:
+
+    - :py:class:`.QGrams` tokenizes a string into q-grams, substrings of
+      length q. The class supports different values of q, the addition of
+      start and stop symbols, and skip values. It even supports multiple
+      values for q and skip, using lists or ranges.
+
+    - :py:class:`.QSkipgrams` tokenizes a string into skipgrams of length q.
+      A skipgram is a sequence of q characters drawn from the string, not
+      necessarily contiguously. For example, the string 'ABCD' has the
+      following 2-skipgrams: 'AB', 'AC', 'AD', 'BC', 'BD', 'CD'.
+
+    - :py:class:`.CharacterTokenizer` tokenizes a string into individual
+      characters.
+
+    - :py:class:`.RegexpTokenizer` tokenizes a string according to a supplied
+      regular expression.
+
+    - :py:class:`.WhitespaceTokenizer` tokenizes a string by dividing it at
+      instances of whitespace.
+
+    - :py:class:`.WordpunctTokenizer` tokenizes a string by dividing it into
+      strings of letters and strings of punctuation.
+
+Six syllable-oriented tokenizers are provided:
+
+    - :py:class:`.COrVClusterTokenizer` tokenizes a string by dividing it into
+      strings of consonants (C* clusters), vowels (V* clusters), or non-letter
+      characters.
+
+    - :py:class:`.CVClusterTokenizer` tokenizes a string by dividing it into
+      strings of consonants then vowels (C*V* clusters) or non-letter
+      characters.
+
+    - :py:class:`.VCClusterTokenizer` tokenizes a string by dividing it into
+      strings of vowels then consonants (V*C* clusters) or non-letter
+      characters.
+
+    - :py:class:`.SAPSTokenizer` tokenizes a string according to the rules
+      specified by the SAPS syllabification algorithm :cite:`Ruibin:2005`.
+
+    - :py:class:`.SonoriPyTokenizer` does syllabification according to the
+      sonority sequencing principle, using SyllabiPy. It requires that
+      SyllabiPy_ be installed.
+ + - :py:class:`.LegaliPyTokenizer` does syllabification according to the + onset maximization principle (principle of legality), using SyllabiPy. + It requires that SyllabiPy_ be installed, and works best if it has been + trained on a corpus of text. + +Finally, an NLTK tokenizer is provided: + + - :py:class:`.NLTKTokenizer` does tokenization using an instantiated NLTK + tokenizer. Accordingly, NLTK_ needs to be installed. + +.. _SyllabiPy: https://pypi.org/project/syllabipy/ +.. _NLTK: https://www.nltk.org/ + ---- @@ -54,9 +100,37 @@ unicode_literals, ) -from ._qgrams import QGrams +from ._c_or_v_cluster import COrVClusterTokenizer +from ._character import CharacterTokenizer +from ._cv_cluster import CVClusterTokenizer +from ._legalipy import LegaliPyTokenizer +from ._nltk import NLTKTokenizer +from ._q_grams import QGrams +from ._q_skipgrams import QSkipgrams +from ._regexp import RegexpTokenizer +from ._saps import SAPSTokenizer +from ._sonoripy import SonoriPyTokenizer +from ._tokenizer import _Tokenizer +from ._vc_cluster import VCClusterTokenizer +from ._whitespace import WhitespaceTokenizer +from ._wordpunct import WordpunctTokenizer -__all__ = ['QGrams'] +__all__ = [ + '_Tokenizer', + 'QGrams', + 'QSkipgrams', + 'CharacterTokenizer', + 'RegexpTokenizer', + 'WhitespaceTokenizer', + 'WordpunctTokenizer', + 'COrVClusterTokenizer', + 'CVClusterTokenizer', + 'VCClusterTokenizer', + 'SAPSTokenizer', + 'SonoriPyTokenizer', + 'LegaliPyTokenizer', + 'NLTKTokenizer', +] if __name__ == '__main__': diff --git a/abydos/tokenizer/_c_or_v_cluster.py b/abydos/tokenizer/_c_or_v_cluster.py new file mode 100644 index 000000000..f7c1dc60b --- /dev/null +++ b/abydos/tokenizer/_c_or_v_cluster.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._c_or_v_cluster. + +Consonant or vowel cluster tokenizer. + +This tokenizer first performs wordpunct tokenization, so words are split into +separate units and non-letter characters are added as their own units. +Following this, words are further divided into strings of consonants only and +strings of vowels only. +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import re +import unicodedata + +from ._tokenizer import _Tokenizer + +__all__ = ['COrVClusterTokenizer'] + + +class COrVClusterTokenizer(_Tokenizer): + """A C- or V-cluster tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None, consonants=None, vowels=None): + """Initialize tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. 
Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(COrVClusterTokenizer, self).__init__(scaler=scaler) + if consonants: + self._consonants = consonants + else: + self._consonants = set('bcdfghjklmnpqrstvwxzßBCDFGHJKLMNPQRSTVWXZ') + if vowels: + self._vowels = vowels + else: + self._vowels = set('aeiouyAEIOUY') + self._regexp = re.compile(r'\w+|[^\w\s]+', flags=0) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> COrVClusterTokenizer().tokenize('seven-twelfths') + COrVClusterTokenizer({'e': 3, 's': 1, 'v': 1, 'n': 1, '-': 1, + 'tw': 1, 'lfths': 1}) + + >>> COrVClusterTokenizer().tokenize('character') + COrVClusterTokenizer({'a': 2, 'r': 2, 'ch': 1, 'ct': 1, 'e': 1}) + + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = [] + token_list = self._regexp.findall(self._string) + for token in token_list: + if ( + token[0] not in self._consonants + and token[0] not in self._vowels + ): + self._ordered_tokens.append(token) + else: + token = unicodedata.normalize('NFD', token) + mode = 0 # 0 = starting mode, 1 = cons, 2 = vowels + new_token = '' # noqa: S105 + for char in token: + if char in self._consonants: + if mode == 2: + self._ordered_tokens.append(new_token) + new_token = char + else: + new_token += char + mode = 1 + elif char in self._vowels: + if mode == 1: + self._ordered_tokens.append(new_token) + new_token = char + else: + new_token += char + mode = 2 + else: # This should cover combining marks, marks, etc. + new_token += char + + self._ordered_tokens.append(new_token) + + self._ordered_tokens = [ + unicodedata.normalize('NFC', token) + for token in self._ordered_tokens + ] + super(COrVClusterTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_character.py b/abydos/tokenizer/_character.py new file mode 100644 index 000000000..b8f18f934 --- /dev/null +++ b/abydos/tokenizer/_character.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._character. + +Character tokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._tokenizer import _Tokenizer + +__all__ = ['CharacterTokenizer'] + + +class CharacterTokenizer(_Tokenizer): + """A character tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None): + """Initialize tokenizer. 
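
A short sketch of how the scaler option (documented in the Parameters block below and in the package docstring) affects a tokenizer; the first output is taken from the CharacterTokenizer doctest later in this file, while the scaled variants are only annotated in comments rather than asserted:

>>> from math import log1p
>>> from abydos.tokenizer import CharacterTokenizer
>>> CharacterTokenizer().tokenize('AACTAGAAC')
CharacterTokenizer({'A': 5, 'C': 2, 'T': 1, 'G': 1})
>>> # scaler='set' collapses the multiset to a set, so each surviving
>>> # token's count becomes 1.
>>> tok_set = CharacterTokenizer(scaler='set').tokenize('AACTAGAAC')
>>> # A callable scaler is applied to every raw count, e.g. log1p(5),
>>> # log1p(2), log1p(1).
>>> tok_log = CharacterTokenizer(scaler=log1p).tokenize('AACTAGAAC')
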
+ + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(CharacterTokenizer, self).__init__(scaler) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> CharacterTokenizer().tokenize('AACTAGAAC') + CharacterTokenizer({'A': 5, 'C': 2, 'T': 1, 'G': 1}) + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = list(string) + + super(CharacterTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_cv_cluster.py b/abydos/tokenizer/_cv_cluster.py new file mode 100644 index 000000000..e3c6e4fe2 --- /dev/null +++ b/abydos/tokenizer/_cv_cluster.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._cv_cluster. + +CV cluster tokenizer. + +This tokenizer first performs wordpunct tokenization, so words are split into +separate units and non-letter characters are added as their own units. +Following this, words are further divided into strings of consisting of +consonants then vowels (without limit of either). But, crucially, a vowel to +consonant transition marks the start of a new token. +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import re +import unicodedata + +from ._tokenizer import _Tokenizer + +__all__ = ['CVClusterTokenizer'] + + +class CVClusterTokenizer(_Tokenizer): + """A C*V*-cluster tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None, consonants=None, vowels=None): + """Initialize tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. 
versionadded:: 0.4.0 + + """ + super(CVClusterTokenizer, self).__init__(scaler=scaler) + if consonants: + self._consonants = consonants + else: + self._consonants = set('bcdfghjklmnpqrstvwxzßBCDFGHJKLMNPQRSTVWXZ') + if vowels: + self._vowels = vowels + else: + self._vowels = set('aeiouyAEIOUY') + self._regexp = re.compile(r'\w+|[^\w\s]+', flags=0) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> CVClusterTokenizer().tokenize('seven-twelfths') + CVClusterTokenizer({'se': 1, 've': 1, 'n': 1, '-': 1, 'twe': 1, + 'lfths': 1}) + + >>> CVClusterTokenizer().tokenize('character') + CVClusterTokenizer({'cha': 1, 'ra': 1, 'cte': 1, 'r': 1}) + + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = [] + token_list = self._regexp.findall(self._string) + for token in token_list: + if ( + token[0] not in self._consonants + and token[0] not in self._vowels + ): + self._ordered_tokens.append(token) + else: + token = unicodedata.normalize('NFD', token) + mode = 0 # 0 = starting mode, 1 = cons, 2 = vowels + new_token = '' # noqa: S105 + for char in token: + if char in self._consonants: + if mode == 2: + self._ordered_tokens.append(new_token) + new_token = char + else: + new_token += char + mode = 1 + elif char in self._vowels: + new_token += char + mode = 2 + else: # This should cover combining marks, marks, etc. + new_token += char + + self._ordered_tokens.append(new_token) + + self._ordered_tokens = [ + unicodedata.normalize('NFC', token) + for token in self._ordered_tokens + ] + super(CVClusterTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_legalipy.py b/abydos/tokenizer/_legalipy.py new file mode 100644 index 000000000..c3f98b115 --- /dev/null +++ b/abydos/tokenizer/_legalipy.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._legalipy. + +LegaliPy tokenizer class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._tokenizer import _Tokenizer + +try: + from syllabipy.legalipy import LegaliPy + from syllabipy.legalipy import getOnsets as gen_onsets # noqa: N813 +except ImportError: # pragma: no cover + # If the system lacks the SyllabiPy library, that's fine, but SyllabiPy + # tokenization won't be supported. + gen_onsets = None + LegaliPy = None + + +class LegaliPyTokenizer(_Tokenizer): + """LegaliPy tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None): + """Initialize Tokenizer. 
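
Because a freshly constructed LegaliPyTokenizer knows only the empty onset, it is usually trained on a sample text before use; a hedged sketch of that workflow follows (the training string is purely illustrative, and no post-training output is asserted):

>>> from abydos.tokenizer import LegaliPyTokenizer
>>> tok = LegaliPyTokenizer()
>>> tok.tokenize('seven-twelfths')     # untrained, as in the doctest below
LegaliPyTokenizer({'s': 1, 'ev': 1, 'en-tw': 1, 'elfths': 1})
>>> # train_onsets() extracts an onset inventory from a text sample;
>>> # append=True would extend rather than replace the current list.
>>> tok.train_onsets('the quick brown fox jumps over the lazy dog')
>>> syls = tok.tokenize('seven-twelfths')   # result depends on trained onsets
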
+ + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + if LegaliPy is None: + raise TypeError( # pragma: no cover + 'LegaliPy tokenizer requires installation of SyllabiPy' + + ' package.' + ) + + super(LegaliPyTokenizer, self).__init__(scaler) + + self._onsets = [''] + + def train_onsets(self, text, threshold=0.0002, clean=True, append=False): + """Train the onsets on a text. + + Parameters + ---------- + text : str + The text on which to train + threshold : float + Threshold proportion above which to include onset into onset list + clean : bool + If True, the text is stripped of numerals and punctuation + append : bool + If True, the current onset list is extended + + + .. versionadded:: 0.4.0 + + """ + new_onsets = gen_onsets(text, threshold, clean) + if append: + self._onsets = list(set(self._onsets + new_onsets)) + else: + self._onsets = new_onsets + + def tokenize(self, string, ipa=False): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + ipa : bool + If True, indicates that the string is in IPA + + Examples + -------- + >>> LegaliPyTokenizer().tokenize('seven-twelfths') + LegaliPyTokenizer({'s': 1, 'ev': 1, 'en-tw': 1, 'elfths': 1}) + + >>> LegaliPyTokenizer().tokenize('character') + LegaliPyTokenizer({'ch': 1, 'ar': 1, 'act': 1, 'er': 1}) + + .. versionadded:: 0.4.0 + + """ + self._string = string + + self._ordered_tokens = [] + for word in string.split(): + self._ordered_tokens += LegaliPy(word, self._onsets) + + super(LegaliPyTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_nltk.py b/abydos/tokenizer/_nltk.py new file mode 100644 index 000000000..998d99516 --- /dev/null +++ b/abydos/tokenizer/_nltk.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._nltk. + +NLTK tokenizer wrapper class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._tokenizer import _Tokenizer + + +class NLTKTokenizer(_Tokenizer): + """NLTK tokenizer wrapper class. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, nltk_tokenizer=None, scaler=None): + """Initialize Tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. 
+ - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + nltk_tokenizer : Object + An instantiated tokenizer from NLTK. + + + .. versionadded:: 0.4.0 + + """ + super(NLTKTokenizer, self).__init__(scaler) + + if 'nltk.tokenize' in str(type(nltk_tokenizer)) and hasattr( + nltk_tokenizer, 'tokenize' + ): + self.nltk_tokenizer = nltk_tokenizer + else: + raise TypeError( + 'nltk_tokenizer must be an initialized tokenizer from the' + + ' NLTK package (e.g. TweetTokenizer()).' + ) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> from nltk.tokenize.casual import TweetTokenizer + >>> nltk_tok = TweetTokenizer() + >>> NLTKTokenizer(nltk_tokenizer=nltk_tok).tokenize( + ... '.@Twitter Today is #lit!') + NLTKTokenizer({'.': 1, '@Twitter': 1, 'Today': 1, 'is': 1, '#lit': 1, + '!': 1}) + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = self.nltk_tokenizer.tokenize(string) + super(NLTKTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_q_grams.py b/abydos/tokenizer/_q_grams.py new file mode 100644 index 000000000..e866f7263 --- /dev/null +++ b/abydos/tokenizer/_q_grams.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._q_grams. + +QGrams multi-set class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Iterable + +from six.moves import range + +from ._tokenizer import _Tokenizer + +__all__ = ['QGrams'] + + +class QGrams(_Tokenizer): + """A q-gram class, which functions like a bag/multiset. + + A q-gram is here defined as all sequences of q characters. Q-grams are also + known as k-grams and n-grams, but the term n-gram more typically refers to + sequences of whitespace-delimited words in a string, where q-gram refers + to sequences of characters in a word or string. + + .. versionadded:: 0.1.0 + """ + + def __init__(self, qval=2, start_stop='$#', skip=0, scaler=None): + """Initialize QGrams. + + Parameters + ---------- + qval : int or Iterable + The q-gram length (defaults to 2), can be an integer, range object, + or list + start_stop : str + A string of length >= 0 indicating start & stop symbols. + If the string is '', q-grams will be calculated without start & + stop symbols appended to each end. 
+ Otherwise, the first character of start_stop will pad the + beginning of the string and the last character of start_stop + will pad the end of the string before q-grams are calculated. + (In the case that start_stop is only 1 character long, the same + symbol will be used for both.) + skip : int or Iterable + The number of characters to skip, can be an integer, range object, + or list + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + Raises + ------ + ValueError + Use WhitespaceTokenizer instead of qval=0. + + Examples + -------- + >>> qg = QGrams().tokenize('AATTATAT') + >>> qg + QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1}) + + >>> qg = QGrams(qval=1, start_stop='').tokenize('AATTATAT') + >>> qg + QGrams({'A': 4, 'T': 4}) + + >>> qg = QGrams(qval=3, start_stop='').tokenize('AATTATAT') + >>> qg + QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1}) + + >>> QGrams(qval=2, start_stop='$#').tokenize('interning') + QGrams({'in': 2, '$i': 1, 'nt': 1, 'te': 1, 'er': 1, 'rn': 1, + 'ni': 1, 'ng': 1, 'g#': 1}) + + >>> QGrams(start_stop='', skip=1).tokenize('AACTAGAAC') + QGrams({'AC': 2, 'AT': 1, 'CA': 1, 'TG': 1, 'AA': 1, 'GA': 1, 'A': 1}) + + >>> QGrams(start_stop='', skip=[0, 1]).tokenize('AACTAGAAC') + QGrams({'AC': 4, 'AA': 3, 'GA': 2, 'CT': 1, 'TA': 1, 'AG': 1, + 'AT': 1, 'CA': 1, 'TG': 1, 'A': 1}) + + >>> QGrams(qval=range(3), skip=[0, 1]).tokenize('interdisciplinarian') + QGrams({'i': 10, 'n': 7, 'r': 4, 'a': 4, 'in': 3, 't': 2, 'e': 2, + 'd': 2, 's': 2, 'c': 2, 'p': 2, 'l': 2, 'ri': 2, 'ia': 2, '$i': 1, + 'nt': 1, 'te': 1, 'er': 1, 'rd': 1, 'di': 1, 'is': 1, 'sc': 1, 'ci': 1, + 'ip': 1, 'pl': 1, 'li': 1, 'na': 1, 'ar': 1, 'an': 1, 'n#': 1, '$n': 1, + 'it': 1, 'ne': 1, 'tr': 1, 'ed': 1, 'ds': 1, 'ic': 1, 'si': 1, 'cp': 1, + 'il': 1, 'pi': 1, 'ln': 1, 'nr': 1, 'ai': 1, 'ra': 1, 'a#': 1}) + + .. versionadded:: 0.1.0 + .. versionchanged:: 0.4.0 + Broke tokenization functions out into tokenize method + + """ + if qval == 0: + raise ValueError('Use WhitespaceTokenizer instead of qval=0.') + super(QGrams, self).__init__(scaler) + + # Save parameters + self.qval = qval + self.start_stop = start_stop + if qval == 1: + self.start_stop = '' + self.skip = skip + + self._string_ss = self._string + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + + .. 
versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = [] + + if not isinstance(self.qval, Iterable): + self.qval = (self.qval,) + if not isinstance(self.skip, Iterable): + self.skip = (self.skip,) + + if string: + for qval_i in self.qval: + for skip_i in self.skip: + if qval_i < 1: + continue + + if self.start_stop: + string = ( + self.start_stop[0] * (qval_i - 1) + + self._string + + self.start_stop[-1] * (qval_i - 1) + ) + else: + string = self._string + + if qval_i > 1 and len(string) < qval_i: + continue + + # Having appended start & stop symbols (or not), save the + # result, but only for the longest valid qval_i + if len(string) > len(self._string_ss): + self._string_ss = string + + skip_i += 1 + self._ordered_tokens += [ + string[i : i + (qval_i * skip_i) : skip_i] + for i in range(len(string) - (qval_i - 1)) + ] + + super(QGrams, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_q_skipgrams.py b/abydos/tokenizer/_q_skipgrams.py new file mode 100644 index 000000000..dcaad4252 --- /dev/null +++ b/abydos/tokenizer/_q_skipgrams.py @@ -0,0 +1,209 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._q_skipgrams. + +Q-Skipgrams multi-set class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Iterable +from itertools import combinations + +from ._tokenizer import _Tokenizer + +__all__ = ['QSkipgrams'] + + +class QSkipgrams(_Tokenizer): + """A q-skipgram class, which functions like a bag/multiset. + + A q-gram is here defined as all sequences of q characters. Q-grams are also + known as k-grams and n-grams, but the term n-gram more typically refers to + sequences of whitespace-delimited words in a string, where q-gram refers + to sequences of characters in a word or string. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, qval=2, start_stop='$#', scaler=None, lambda_val=0.9): + """Initialize QSkipgrams. + + Parameters + ---------- + qval : int or Iterable + The q-gram length (defaults to 2), can be an integer, range object, + or list + start_stop : str + A string of length >= 0 indicating start & stop symbols. + If the string is '', q-grams will be calculated without start & + stop symbols appended to each end. + Otherwise, the first character of start_stop will pad the + beginning of the string and the last character of start_stop + will pad the end of the string before q-grams are calculated. + (In the case that start_stop is only 1 character long, the same + symbol will be used for both.) + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. 
+ - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + - 'SSK' : Applies weighting according to the substring kernel + rules of :cite:`Lodhi:2002`. + lambda_val : float + A value in the range (0.0, 1.0) used for discouting gaps between + characters according to the method described in :cite:`Lodhi:2002`. + + Raises + ------ + ValueError + Use WhitespaceTokenizer instead of qval=0. + + Examples + -------- + >>> QSkipgrams().tokenize('AATTAT') + QSkipgrams({'AT': 7, '$A': 3, '$T': 3, 'AA': 3, 'A#': 3, 'TT': 3, + 'T#': 3, 'TA': 2, '$#': 1}) + + >>> QSkipgrams(qval=1, start_stop='').tokenize('AATTAT') + QSkipgrams({'A': 3, 'T': 3}) + + >>> QSkipgrams(qval=3, start_stop='').tokenize('AATTAT') + QSkipgrams({'ATT': 6, 'AAT': 5, 'ATA': 4, 'TAT': 2, 'AAA': 1, + 'TTA': 1, 'TTT': 1}) + + >>> QSkipgrams(start_stop='').tokenize('ABCD') + QSkipgrams({'AB': 1, 'AC': 1, 'AD': 1, 'BC': 1, 'BD': 1, 'CD': 1}) + + >>> QSkipgrams().tokenize('Colin') + QSkipgrams({'$C': 1, '$o': 1, '$l': 1, '$i': 1, '$n': 1, '$#': 1, + 'Co': 1, 'Cl': 1, 'Ci': 1, 'Cn': 1, 'C#': 1, 'ol': 1, 'oi': 1, 'on': 1, + 'o#': 1, 'li': 1, 'ln': 1, 'l#': 1, 'in': 1, 'i#': 1, 'n#': 1}) + + >>> QSkipgrams(qval=3).tokenize('AACTAGAAC') + QSkipgrams({'$AA': 20, '$A#': 20, 'AA#': 20, '$AC': 14, 'AC#': 14, + 'AAC': 11, 'AAA': 10, '$C#': 8, '$AG': 6, '$CA': 6, '$TA': 6, 'ACA': 6, + 'ATA': 6, 'AGA': 6, 'AG#': 6, 'CA#': 6, 'TA#': 6, '$$A': 5, 'A##': 5, + '$AT': 4, '$T#': 4, '$GA': 4, '$G#': 4, 'AT#': 4, 'GA#': 4, 'AAG': 3, + 'AGC': 3, 'CTA': 3, 'CAA': 3, 'CAC': 3, 'TAA': 3, 'TAC': 3, '$$C': 2, + '$$#': 2, '$CT': 2, '$CG': 2, '$CC': 2, '$TG': 2, '$TC': 2, '$GC': 2, + '$##': 2, 'ACT': 2, 'ACG': 2, 'ACC': 2, 'ATG': 2, 'ATC': 2, 'CT#': 2, + 'CGA': 2, 'CG#': 2, 'CC#': 2, 'C##': 2, 'TGA': 2, 'TG#': 2, 'TC#': 2, + 'GAC': 2, 'GC#': 2, '$$T': 1, '$$G': 1, 'AAT': 1, 'CTG': 1, 'CTC': 1, + 'CAG': 1, 'CGC': 1, 'TAG': 1, 'TGC': 1, 'T##': 1, 'GAA': 1, 'G##': 1}) + + QSkipgrams may also be used to produce weights in accordance with the + substring kernel rules of :cite:`Lodhi:2002` by passing the scaler + value ``'SSK'``: + + >>> QSkipgrams(scaler='SSK').tokenize('AACTAGAAC') + QSkipgrams({'AA': 6.170192010000001, 'AC': 4.486377699, + '$A': 2.8883286990000006, 'A#': 2.6526399291000002, 'TA': 2.05659, + 'AG': 1.931931, 'CA': 1.850931, 'GA': 1.5390000000000001, 'AT': 1.3851, + 'C#': 1.2404672100000003, '$C': 1.0047784401000002, 'CT': 0.81, + 'TG': 0.7290000000000001, 'CG': 0.6561, 'GC': 0.6561, + '$T': 0.5904900000000001, 'G#': 0.5904900000000001, 'TC': 0.531441, + '$G': 0.4782969000000001, 'CC': 0.4782969000000001, + 'T#': 0.4782969000000001, '$#': 0.31381059609000006}) + + .. versionadded:: 0.4.0 + + """ + super(QSkipgrams, self).__init__(scaler) + + # Save parameters + self.qval = qval + self.start_stop = start_stop + if qval == 1: + self.start_stop = '' + + self._string_ss = self._string + self._lambda = lambda_val + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + + .. 
versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = [] + self._ordered_weights = [] + + if not isinstance(self.qval, Iterable): + self.qval = (self.qval,) + + for qval_i in self.qval: + if qval_i < 1: + continue + + if self.start_stop and self._string: + string = ( + self.start_stop[0] * (qval_i - 1) + + self._string + + self.start_stop[-1] * (qval_i - 1) + ) + else: + string = self._string + + if len(string) < qval_i: + continue + + # Having appended start & stop symbols (or not), save the + # result, but only for the longest valid qval_i + if len(string) > len(self._string_ss): + self._string_ss = string + + combs = list(combinations(enumerate(string), qval_i)) + self._ordered_tokens += [''.join(l[1] for l in t) for t in combs] + + if self._scaler == 'SSK': + self._ordered_weights += [ + self._lambda ** (t[-1][0] - t[0][0] + len(t) - 1) + for t in combs + ] + else: + self._ordered_weights += [1] * len(combs) + + super(QSkipgrams, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_qgrams.py b/abydos/tokenizer/_qgrams.py deleted file mode 100644 index 19a6f5fca..000000000 --- a/abydos/tokenizer/_qgrams.py +++ /dev/null @@ -1,151 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2014-2018 by Christopher C. Little. -# This file is part of Abydos. -# -# Abydos is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Abydos is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Abydos. If not, see . - -"""abydos.tokenizer._q_grams. - -QGrams multi-set class -""" - -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals, -) - -from collections import Counter, Iterable - -from six.moves import range - -__all__ = ['QGrams'] - - -class QGrams(Counter): - """A q-gram class, which functions like a bag/multiset. - - A q-gram is here defined as all sequences of q characters. Q-grams are also - known as k-grams and n-grams, but the term n-gram more typically refers to - sequences of whitespace-delimited words in a string, where q-gram refers - to sequences of characters in a word or string. - """ - - def __init__(self, term, qval=2, start_stop='$#', skip=0): - """Initialize QGrams. - - Parameters - ---------- - term : str - A string to extract q-grams from - qval : int or Iterable - The q-gram length (defaults to 2), can be an integer, range object, - or list - start_stop : str - A string of length >= 0 indicating start & stop symbols. - If the string is '', q-grams will be calculated without start & - stop symbols appended to each end. - Otherwise, the first character of start_stop will pad the - beginning of the string and the last character of start_stop - will pad the end of the string before q-grams are calculated. - (In the case that start_stop is only 1 character long, the same - symbol will be used for both.) 
- skip : int or Iterable - The number of characters to skip, can be an integer, range object, - or list - - Examples - -------- - >>> qg = QGrams('AATTATAT') - >>> qg - QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1}) - - >>> qg = QGrams('AATTATAT', qval=1, start_stop='') - >>> qg - QGrams({'A': 4, 'T': 4}) - - >>> qg = QGrams('AATTATAT', qval=3, start_stop='') - >>> qg - QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1}) - - """ - # Save the term itself - self._term = term - self._term_ss = term - self._ordered_list = [] - - if not isinstance(qval, Iterable): - qval = (qval,) - if not isinstance(skip, Iterable): - skip = (skip,) - - for qval_i in qval: - for skip_i in skip: - if len(self._term) < qval_i or qval_i < 1: - continue - - if start_stop and qval_i > 1: - term = ( - start_stop[0] * (qval_i - 1) - + self._term - + start_stop[-1] * (qval_i - 1) - ) - else: - term = self._term - - # Having appended start & stop symbols (or not), save the - # result, but only for the longest valid qval_i - if len(term) > len(self._term_ss): - self._term_ss = term - - skip_i += 1 - self._ordered_list += [ - term[i : i + (qval_i * skip_i) : skip_i] - for i in range(len(term) - (qval_i - 1)) - ] - - super(QGrams, self).__init__(self._ordered_list) - - def count(self): - """Return q-grams count. - - Returns - ------- - int - The total count of q-grams in a QGrams object - - Examples - -------- - >>> qg = QGrams('AATTATAT') - >>> qg.count() - 9 - - >>> qg = QGrams('AATTATAT', qval=1, start_stop='') - >>> qg.count() - 8 - - >>> qg = QGrams('AATTATAT', qval=3, start_stop='') - >>> qg.count() - 6 - - """ - return sum(self.values()) - - -if __name__ == '__main__': - import doctest - - doctest.testmod() diff --git a/abydos/tokenizer/_regexp.py b/abydos/tokenizer/_regexp.py new file mode 100644 index 000000000..5c7a0e17a --- /dev/null +++ b/abydos/tokenizer/_regexp.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._wordpunct. + +Regexp tokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import re + +from ._tokenizer import _Tokenizer + +__all__ = ['RegexpTokenizer'] + + +class RegexpTokenizer(_Tokenizer): + """A regexp tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None, regexp=r'\w+', flags=0): + """Initialize tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. 
versionadded:: 0.4.0 + + """ + super(RegexpTokenizer, self).__init__(scaler) + + # Save parameters + self._regexp = re.compile(regexp, flags) + + self._string = '' + self._ordered_tokens = [] + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> RegexpTokenizer(regexp=r'[^-]+').tokenize('AA-CT-AG-AA-CD') + RegexpTokenizer({'AA': 2, 'CT': 1, 'AG': 1, 'CD': 1}) + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = self._regexp.findall(self._string) + super(RegexpTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_saps.py b/abydos/tokenizer/_saps.py new file mode 100644 index 000000000..a3b9c8a0c --- /dev/null +++ b/abydos/tokenizer/_saps.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._saps. + +SAPS class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._tokenizer import _Tokenizer + + +class SAPSTokenizer(_Tokenizer): + """Syllable Alignment Pattern Searching tokenizer. + + This is the syllabifier described on p. 917 of :cite:`Ruibin:2005`. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None): + """Initialize Tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(SAPSTokenizer, self).__init__(scaler) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> SAPSTokenizer().tokenize('seven-twelfths') + SAPSTokenizer({'t': 2, 'se': 1, 'ven': 1, '-': 1, 'wel': 1, 'f': 1, + 'h': 1, 's': 1}) + + >>> SAPSTokenizer().tokenize('character') + SAPSTokenizer({'c': 1, 'ha': 1, 'rac': 1, 'ter': 1}) + + + .. 
versionadded:: 0.4.0
+
+        """
+        self._string = string
+
+        self._ordered_tokens = []
+
+        _vowels = set('aeiouyAEIOUY')
+
+        # Accumulate syllables across all whitespace-separated words
+        words = self._string.split()
+        for w in words:
+            i = 0
+            while i < len(w):
+                syll = w[i : i + 1]
+                i += 1
+                while w[i : i + 1] in _vowels:
+                    syll += w[i : i + 1]
+                    i += 1
+                if syll[-1] in _vowels and (
+                    (
+                        len(w[i:]) > 1
+                        and w[i : i + 1] not in _vowels
+                        and w[i + 1 : i + 2] not in _vowels
+                    )
+                    or (len(w[i:]) == 1 and w[i : i + 1] not in _vowels)
+                ):
+                    syll += w[i : i + 1]
+                    i += 1
+                self._ordered_tokens.append(syll)
+
+        super(SAPSTokenizer, self).tokenize()
+        return self
+
+
+if __name__ == '__main__':
+    import doctest
+
+    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/abydos/tokenizer/_sonoripy.py b/abydos/tokenizer/_sonoripy.py
new file mode 100644
index 000000000..6ac3f3a50
--- /dev/null
+++ b/abydos/tokenizer/_sonoripy.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
+
+"""abydos.tokenizer._sonoripy.
+
+SonoriPy class
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+from ._tokenizer import _Tokenizer
+
+try:
+    from syllabipy.sonoripy import SonoriPy
+except ImportError:  # pragma: no cover
+    # If the system lacks the SyllabiPy library, that's fine, but SyllabiPy
+    # tokenization won't be supported.
+    SonoriPy = None
+
+
+class SonoriPyTokenizer(_Tokenizer):
+    """SonoriPy tokenizer.
+
+    .. versionadded:: 0.4.0
+    """
+
+    def __init__(self, scaler=None):
+        """Initialize Tokenizer.
+
+        Parameters
+        ----------
+        scaler : None, str, or function
+            A scaling function for the Counter:
+
+            - None : no scaling
+            - 'set' : All non-zero values are set to 1.
+            - a callable function : The function is applied to each value
+              in the Counter. Some useful functions include math.exp,
+              math.log1p, math.sqrt, and indexes into interesting integer
+              sequences such as the Fibonacci sequence.
+
+
+        .. versionadded:: 0.4.0
+
+        """
+        if SonoriPy is None:
+            raise TypeError(  # pragma: no cover
+                'SonoriPy tokenizer requires installation of SyllabiPy'
+                + ' package.'
+            )
+
+        super(SonoriPyTokenizer, self).__init__(scaler)
+
+    def tokenize(self, string):
+        """Tokenize the term and store it.
+
+        The tokenized term is stored as an ordered list and as a Counter
+        object.
+
+        Parameters
+        ----------
+        string : str
+            The string to tokenize
+
+        Examples
+        --------
+        >>> SonoriPyTokenizer().tokenize('seven-twelfths')
+        SonoriPyTokenizer({'se': 1, 'ven-': 1, 'twelfths': 1})
+
+        >>> SonoriPyTokenizer().tokenize('character')
+        SonoriPyTokenizer({'cha': 1, 'rac': 1, 'ter': 1})
+
+
+        ..
versionadded:: 0.4.0 + + """ + self._string = string + + self._ordered_tokens = [] + for word in string.split(): + self._ordered_tokens += SonoriPy(word) + + super(SonoriPyTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_tokenizer.py b/abydos/tokenizer/_tokenizer.py new file mode 100644 index 000000000..fa56acced --- /dev/null +++ b/abydos/tokenizer/_tokenizer.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._tokenize. + +_Tokenizer base class +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from collections import Counter + +__all__ = ['_Tokenizer'] + + +class _Tokenizer(object): + """Abstract _Tokenizer class. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None, *args, **kwargs): + """Initialize Tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(_Tokenizer, self).__init__() + + self._scaler = scaler + self._tokens = Counter() + self._string = '' + self._ordered_tokens = [] + self._ordered_weights = [] + + def tokenize(self, string=None): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str or None + The string to tokenize + + + .. versionadded:: 0.4.0 + + """ + if string is not None: + self._string = string + self._ordered_tokens = [self._string] + self._ordered_weights = [1] + + if self._scaler in {'SSK'}: + for token, weight in zip( + self._ordered_tokens, self._ordered_weights + ): + self._tokens[token] += weight + else: + self._tokens = Counter(self._ordered_tokens) + + return self + + def count(self): + """Return token count. + + Returns + ------- + int + The total count of tokens + + Examples + -------- + >>> tok = _Tokenizer().tokenize('term') + >>> tok.count() + 1 + + + .. versionadded:: 0.4.0 + + """ + return sum(self.get_counter().values()) + + def count_unique(self): + """Return the number of unique elements. + + Returns + ------- + int + The number of unique tokens + + Examples + -------- + >>> tok = _Tokenizer().tokenize('term') + >>> tok.count_unique() + 1 + + + .. versionadded:: 0.4.0 + + """ + return len(self._tokens.values()) + + def get_counter(self): + """Return the tokens as a Counter object. 
+ + Returns + ------- + Counter + The Counter of tokens + + Examples + -------- + >>> tok = _Tokenizer().tokenize('term') + >>> tok.get_counter() + Counter({'term': 1}) + + + .. versionadded:: 0.4.0 + + """ + if self._scaler == 'set': + return Counter({key: 1 for key in self._tokens.keys()}) + elif callable(self._scaler): + return Counter( + {key: self._scaler(val) for key, val in self._tokens.items()} + ) + else: + return self._tokens + + def get_set(self): + """Return the unique tokens as a set. + + Returns + ------- + Counter + The set of tokens + + Examples + -------- + >>> tok = _Tokenizer().tokenize('term') + >>> tok.get_set() + {'term'} + + + .. versionadded:: 0.4.0 + + """ + return set(self._tokens.keys()) + + def get_list(self): + """Return the tokens as an ordered list. + + Returns + ------- + Counter + The list of q-grams in the order they were added. + + Examples + -------- + >>> tok = _Tokenizer().tokenize('term') + >>> tok.get_list() + ['term'] + + + .. versionadded:: 0.4.0 + + """ + return self._ordered_tokens + + def __repr__(self): + """Return representation of tokens object. + + .. versionadded:: 0.4.0 + + """ + return self.__class__.__name__ + '{}'.format(str(self._tokens)[7:]) + + def __and__(self, other): + """Return intersection with other tokens. + + .. versionadded:: 0.4.0 + + """ + return self.get_counter() & other.get_counter() + + def __add__(self, other): + """Return union with other tokens. + + .. versionadded:: 0.4.0 + + """ + return self.get_counter() + other.get_counter() + + def __sub__(self, other): + """Return difference from other tokens. + + .. versionadded:: 0.4.0 + + """ + return self.get_counter() - other.get_counter() + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_vc_cluster.py b/abydos/tokenizer/_vc_cluster.py new file mode 100644 index 000000000..a4ef8e844 --- /dev/null +++ b/abydos/tokenizer/_vc_cluster.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._vc_cluster. + +VC cluster tokenizer. + +This tokenizer first performs wordpunct tokenization, so words are split into +separate units and non-letter characters are added as their own units. +Following this, words are further divided into strings of consisting of +vowels then consonants (without limit of either). But, crucially, a consonant +to vowel transition marks the start of a new token. +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import re +import unicodedata + +from ._tokenizer import _Tokenizer + +__all__ = ['VCClusterTokenizer'] + + +class VCClusterTokenizer(_Tokenizer): + """A V*C*-cluster tokenizer. + + .. versionadded:: 0.4.0 + """ + + def __init__(self, scaler=None, consonants=None, vowels=None): + """Initialize tokenizer. 
+ + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(VCClusterTokenizer, self).__init__(scaler=scaler) + if consonants: + self._consonants = consonants + else: + self._consonants = set('bcdfghjklmnpqrstvwxzßBCDFGHJKLMNPQRSTVWXZ') + if vowels: + self._vowels = vowels + else: + self._vowels = set('aeiouyAEIOUY') + self._regexp = re.compile(r'\w+|[^\w\s]+', flags=0) + + def tokenize(self, string): + """Tokenize the term and store it. + + The tokenized term is stored as an ordered list and as a Counter + object. + + Parameters + ---------- + string : str + The string to tokenize + + Examples + -------- + >>> VCClusterTokenizer().tokenize('seven-twelfths') + VCClusterTokenizer({'s': 1, 'ev': 1, 'en': 1, '-': 1, 'tw': 1, + 'elfths': 1}) + + >>> VCClusterTokenizer().tokenize('character') + VCClusterTokenizer({'ch': 1, 'ar': 1, 'act': 1, 'er': 1}) + + + .. versionadded:: 0.4.0 + + """ + self._string = string + self._ordered_tokens = [] + token_list = self._regexp.findall(self._string) + for token in token_list: + if ( + token[0] not in self._consonants + and token[0] not in self._vowels + ): + self._ordered_tokens.append(token) + else: + token = unicodedata.normalize('NFD', token) + mode = 0 # 0 = starting mode, 1 = cons, 2 = vowels + new_token = '' # noqa: S105 + for char in token: + if char in self._consonants: + new_token += char + mode = 1 + elif char in self._vowels: + if mode == 1: + self._ordered_tokens.append(new_token) + new_token = char + else: + new_token += char + mode = 2 + else: # This should cover combining marks, marks, etc. + new_token += char + + self._ordered_tokens.append(new_token) + + self._ordered_tokens = [ + unicodedata.normalize('NFC', token) + for token in self._ordered_tokens + ] + super(VCClusterTokenizer, self).tokenize() + return self + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/tokenizer/_whitespace.py b/abydos/tokenizer/_whitespace.py new file mode 100644 index 000000000..aa86838c1 --- /dev/null +++ b/abydos/tokenizer/_whitespace.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._whitespace. + +Whitespace tokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._regexp import RegexpTokenizer + +__all__ = ['WhitespaceTokenizer'] + + +class WhitespaceTokenizer(RegexpTokenizer): + """A whitespace tokenizer. 
+ + Examples + -------- + >>> WhitespaceTokenizer().tokenize('a b c f a c g e a b') + WhitespaceTokenizer({'a': 3, 'b': 2, 'c': 2, 'f': 1, 'g': 1, 'e': 1}) + + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, scaler=None, flags=0): + """Initialize tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. versionadded:: 0.4.0 + + """ + super(WhitespaceTokenizer, self).__init__( + scaler, regexp=r'\S+', flags=flags + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/tokenizer/_wordpunct.py b/abydos/tokenizer/_wordpunct.py new file mode 100644 index 000000000..8ce15e021 --- /dev/null +++ b/abydos/tokenizer/_wordpunct.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tokenizer._wordpunct. + +Wordpunct tokenizer (analogous to NLTK's workpunct tokenizer) +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from ._regexp import RegexpTokenizer + +__all__ = ['WordpunctTokenizer'] + + +class WordpunctTokenizer(RegexpTokenizer): + """A wordpunct tokenizer. + + Examples + -------- + >>> WordpunctTokenizer().tokenize("Can't stop the feelin'!") + WordpunctTokenizer({'Can': 1, "'": 1, 't': 1, 'stop': 1, 'the': 1, + 'feelin': 1, "'!": 1}) + + + .. versionadded:: 0.4.0 + + """ + + def __init__(self, scaler=None, flags=0): + """Initialize tokenizer. + + Parameters + ---------- + scaler : None, str, or function + A scaling function for the Counter: + + - None : no scaling + - 'set' : All non-zero values are set to 1. + - a callable function : The function is applied to each value + in the Counter. Some useful functions include math.exp, + math.log1p, math.sqrt, and indexes into interesting integer + sequences such as the Fibonacci sequence. + + + .. 
versionadded:: 0.4.0 + + """ + super(WordpunctTokenizer, self).__init__( + scaler, regexp=r'\w+|[^\w\s]+', flags=flags + ) + + +if __name__ == '__main__': + import doctest + + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) diff --git a/abydos/util/__init__.py b/abydos/util/__init__.py index bf107b292..b958c4414 100644 --- a/abydos/util/__init__.py +++ b/abydos/util/__init__.py @@ -33,7 +33,21 @@ unicode_literals, ) -__all__ = [] +from ._data import ( + data_path, + download_package, + list_available_packages, + list_installed_packages, + package_path, +) + +__all__ = [ + 'data_path', + 'download_package', + 'list_available_packages', + 'list_installed_packages', + 'package_path', +] if __name__ == '__main__': diff --git a/abydos/util/_data.py b/abydos/util/_data.py new file mode 100644 index 000000000..0fbf749ad --- /dev/null +++ b/abydos/util/_data.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.util._data. + +The util._data module manages datasets from +https://github.com/chrislit/abydos-data, including downloading them, +decompressing them, and locating them once installed. + +Much of this is copied from NLTK's similar facility in +http://www.nltk.org/_modules/nltk/data.html, because they seem to have the +issues figured out, because I don't want to expend the effort to re-invent a +solution, and because their license (Apache) allows for it. +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import os +import re +import sys + +try: + import urllib.request as urllib +except ImportError: # pragma: no cover + import urllib +import zipfile + +from xml.etree import ElementTree # noqa: S405 + +__all__ = [ + 'data_path', + 'download_package', + 'list_available_packages', + 'list_installed_packages', + 'package_path', +] + + +DATA_SUBDIRS = ['corpora'] +INDEX_URL = ( + 'https://raw.githubusercontent.com/chrislit/abydos-data/master/index.xml' +) + +data_path = [] +"""A list of directories where the Abydos data package might reside. + These directories will be checked in order when looking for a + resource in the data package. 
Note that this allows users to + substitute in their own versions of resources, if they have them + (e.g., in their home directory under ~/abydos_data).""" + +# User-specified locations: +_paths_from_env = os.environ.get('ABYDOS_DATA', str('')).split( + os.pathsep +) # pragma: no cover +data_path += [d for d in _paths_from_env if d] +if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/': + data_path.append(os.path.expanduser(str('~/abydos_data'))) + +if sys.platform.startswith('win'): # pragma: no cover + # Common locations on Windows: + data_path += [ + os.path.join(sys.prefix, str('abydos_data')), + os.path.join(sys.prefix, str('share'), str('abydos_data')), + os.path.join(sys.prefix, str('lib'), str('abydos_data')), + os.path.join( + os.environ.get(str('APPDATA'), str('C:\\')), str('abydos_data') + ), + str(r'C:\abydos_data'), + str(r'D:\abydos_data'), + str(r'E:\abydos_data'), + ] +else: + # Common locations on UNIX & OS X: + data_path += [ + os.path.join(sys.prefix, str('abydos_data')), + os.path.join(sys.prefix, str('share'), str('abydos_data')), + os.path.join(sys.prefix, str('lib'), str('abydos_data')), + str('/usr/share/abydos_data'), + str('/usr/local/share/abydos_data'), + str('/usr/lib/abydos_data'), + str('/usr/local/lib/abydos_data'), + ] + + +def package_path(resource_name): + """Given a resource name, returns the path to the package.""" + for path in data_path: + for subdir in DATA_SUBDIRS: + check_path = os.path.join(path, subdir, resource_name) + if os.path.isdir(check_path): + return check_path + msg = 'Data package not found. You may need to install or re-install it.' + raise FileNotFoundError(msg) + + +def list_installed_packages(path=None): + """List all installed data packages.""" + if path: + paths = [path] + else: + paths = data_path + packages = [] + for path in paths: + for subdir in DATA_SUBDIRS: + check_path = os.path.join(path, subdir) + if os.path.isdir(check_path): + possible_packages = os.listdir(check_path) + for package in possible_packages: + if os.path.isdir(os.path.join(check_path, package)): + with open( + os.path.join(check_path, package + '.xml') + ) as xml: + file = xml.read() + name = re.search(r'name="([^"]+)"', file).group(1) + version = re.search( + r'version="([^"]+)"', file + ).group(1) + packages.append((package, name, float(version))) + return packages + + +def list_available_packages(url=None): + """List all data packages available for install.""" + installed_packages = {_[0]: _[2] for _ in list_installed_packages()} + + if url is None: + url = INDEX_URL + if url[:8] != 'https://': + raise ValueError('url should begin with "https://"') + with urllib.urlopen(url) as ix: # noqa: S310 + xml = ElementTree.fromstring(ix.read()) # noqa: S314 + + packages = [ + ( + _.attrib['id'], + _.attrib['name'], + float(_.attrib['version']), + _.attrib['url'], + _.attrib['subdir'], + 'not-installed' + if _.attrib['id'] not in installed_packages + else ( + 'up-to-date' + if installed_packages[_.attrib['id']] + >= float(_.attrib['version']) + else 'update available' + ), + ) + for _ in xml.findall('packages/package') + ] + collections = [ + ( + _.attrib['id'], + _.attrib['name'], + [__.attrib['ref'] for __ in _.findall('item')], + ) + for _ in xml.findall('collections/collection') + ] + return packages, collections + + +def _default_download_dir(): + """Return the directory to which packages will be downloaded by default. 
+ + This is mostly copied from NLTK's + nltk.downloader.Downloader.default_download_dir + + """ + # Check if we are on GAE where we cannot write into filesystem. + if 'APPENGINE_RUNTIME' in os.environ: # pragma: no cover + return + + # Check if we have sufficient permissions to install in a + # variety of system-wide locations. + for abydos_data in data_path: + if os.path.exists(abydos_data) and os.access( + abydos_data, os.W_OK + ): # pragma: no cover + return abydos_data + + # On Windows, use %APPDATA% + if sys.platform == 'win32' and 'APPDATA' in os.environ: # pragma: no cover + homedir = os.environ['APPDATA'] + + # Otherwise, install in the user's home directory. + else: # pragma: no cover + homedir = os.path.expanduser('~/') + if homedir == '~/': + raise ValueError('Could not find a default download directory') + + # append "abydos_data" to the home directory + return os.path.join(homedir, 'abydos_data') # pragma: no cover + + +def download_package( + resource_name, url=None, data_path=None, force=False, silent=False +): + """Download and install a package or collection.""" + packages, collections = list_available_packages(url) + installed = list_installed_packages(data_path) + if data_path is None: + data_path = _default_download_dir() + os.makedirs(data_path, mode=0o775, exist_ok=True) + + for coll in collections: + if resource_name == coll[0]: + if not silent: # pragma: no branch + print('Installing {} collection'.format(coll[1])) # noqa: T001 + for resource_name in coll[2]: + download_package(resource_name, url, data_path) + return + else: + for pack in packages: + if resource_name == pack[0]: + if not force: + for inst in installed: # pragma: no branch + if pack[0] == inst[0] and pack[2] <= inst[2]: + if not silent: + print( # pragma: no cover # noqa: T001 + '{} package already up-to-date'.format( + pack[1] + ) + ) + return + if not silent: # pragma: no branch + print( # noqa: T001 + 'Installing {} package'.format(pack[1]) + ) + zip_fn = os.path.join(data_path, pack[4], pack[0] + '.zip') + os.makedirs( + os.path.join(data_path, pack[4]), mode=0o775, exist_ok=True + ) + urllib.urlretrieve( # noqa: S310 + pack[3][:-3] + 'xml', zip_fn[:-3] + 'xml' + ) + urllib.urlretrieve(pack[3], zip_fn) # noqa: S310 + zip_pkg = zipfile.ZipFile(zip_fn) + zip_pkg.extractall(os.path.join(data_path, pack[4])) + zip_pkg.close() + os.remove(zip_fn) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/util/_ncr.py b/abydos/util/_ncr.py new file mode 100644 index 000000000..234f356ee --- /dev/null +++ b/abydos/util/_ncr.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.util._ncr. + +The util._ncr module defines _ncr, which computes n Choose r. 
+""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +from math import factorial, gamma + +__all__ = [] + + +def _ncr(n, r): + r"""Return n Choose r. + + Cf. https://en.wikipedia.org/wiki/Combination + + Parameters + ---------- + n : float + The number of elements in the set/multiset + r : float + The number of elements to choose + + Returns + ------- + int or float + n Choose r + + Examples + -------- + >>> _ncr(4, 2) + 6 + >>> _ncr(10, 3) + 120 + + .. versionadded:: 0.4.0 + + """ + if isinstance(r, int) and isinstance(n, int): + if not r: + return 1 + if r > n: + return 0 + return int(factorial(n) / (factorial(r) * factorial(n - r))) + return gamma(n + 1) / (gamma(r + 1) * gamma(n - r + 1)) + + +if __name__ == '__main__': + import doctest + + doctest.testmod() diff --git a/abydos/util/_prod.py b/abydos/util/_prod.py index fea4192cd..c0cad07ac 100644 --- a/abydos/util/_prod.py +++ b/abydos/util/_prod.py @@ -18,7 +18,7 @@ """abydos.util._prod. -The util._prod module defines prod, which computes the product of a collection +The util._prod module defines _prod, which computes the product of a collection of numbers (akin to sum, but for product). """ @@ -39,7 +39,11 @@ def _prod(nums): r"""Return the product of nums. - The product is :math:`\prod nums`. + The product is + + .. math:: + + \prod nums Cf. https://en.wikipedia.org/wiki/Product_(mathematics) @@ -64,6 +68,8 @@ def _prod(nums): >>> _prod(2**i for i in range(5)) 1024 + .. versionadded:: 0.1.0 + """ return reduce(mul, nums, 1) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 08e6c897b..b6946dc65 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,7 +26,7 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - - script: python -m pip install --upgrade pip && pip install -r requirements.txt + - script: python -m pip install --upgrade pip && pip install -r requirements.txt && pip install -r requirements-dev.txt displayName: 'Install dependencies' - script: | @@ -68,7 +68,7 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - - script: python -m pip install --upgrade pip && pip install -r requirements.txt + - script: python -m pip install --upgrade pip && pip install -r requirements.txt && pip install -r requirements-dev.txt displayName: 'Install dependencies' - script: | @@ -111,7 +111,7 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - - script: python -m pip install --upgrade pip && pip install -r requirements.txt + - script: python -m pip install --upgrade pip && pip install -r requirements.txt && pip install -r requirements-dev.txt displayName: 'Install dependencies' - script: | diff --git a/btest.sh b/btest.sh index 857d9a316..0d6efe4d2 100755 --- a/btest.sh +++ b/btest.sh @@ -44,6 +44,6 @@ if [ "$docs_only" = "0" ]; then fi fi -sphinx-apidoc -e -f -M -o docs ./abydos +sphinx-apidoc -e -M -o docs ./abydos cd docs || exit make html epub xelatexpdf >> /dev/null 2> /dev/null diff --git a/data/features/features_csv_to_dict.py b/data/features/features_csv_to_dict.py index 340d888b0..0bc6fbaec 100755 --- a/data/features/features_csv_to_dict.py +++ b/data/features/features_csv_to_dict.py @@ -317,7 +317,7 @@ def check_entailments(sym, features, feature_mask): checkset_f.add(featint) if variant < 5: - oline = ' \'{}\': {},'.format( + oline = " '{}': {},".format( symbol, featint ) else: @@ -331,7 +331,7 @@ def check_entailments(sym, features, feature_mask): mag = len(keyline) for i in range(len(keyline)): features = int('0b' + 
('00' * i) + '11' + ('00' * (mag - i - 1)), 2) - oline = ' \'{}\': {},'.format(keyline[i], features) + oline = " '{}': {},".format(keyline[i], features) ofile.write(oline + '\n') ofile.write(' }\n') diff --git a/docs/abydos.bib b/docs/abydos.bib index 19130e009..860aff7d4 100644 --- a/docs/abydos.bib +++ b/docs/abydos.bib @@ -1,7 +1,16 @@ -% This file was created with JabRef 2.10. % Encoding: UTF-8 +@InProceedings{Abreu:2007, + Title = {An Evaluation of Similarity Coefficients for Software Fault Localization}, + Author = {Abreu, Rui and Zoeteweij, Peter and {van Gemund}, {Arjan J. C.}}, + Booktitle = {2006 12th Pacific Rim International Symposium on Dependable Computing (PRDC'06)}, + Year = {2007}, + + Doi = {10.1109/PRDC.2006.18}, + Journal = {IEEE Xplore} +} + @Misc{Adams:2017, Title = {Ruby port of UEALite Stemmer}, @@ -26,6 +35,45 @@ @Article{Amon:2012 Url = {http://www.scielo.org.co/scielo.php?pid=S1692-33242012000100011&script=sci_abstract&tlng=es} } +@Book{Anderberg:1973, + Title = {Cluster Analysis for Applications}, + Author = {Anderberg, {Michael R.}}, + Publisher = {Academic Press}, + Year = {1973}, + + Address = {New York}, + + Doi = {10.1016/C2013-0-06161-0} +} + +@Article{Andres:2004, + Title = {Delta: a new measure of agreement between two raters}, + Author = {Andr{\'{e}}s, {A. Mart{\'{i}}n} and Marzo, {P. Femia}}, + Journal = {British Journal of Mathematical and Statistical Psychology}, + Year = {2004}, + + Month = {{may}}, + Number = {1}, + Pages = {1--20}, + Volume = {57}, + + Doi = {10.1348/000711004849268} +} + +@Article{Austin:1977, + Title = {Evaluation of Some Coefficients for Use in Numerical Taxonomy of Microorganisms}, + Author = {Austin, Brian and Colwell, {Rita R.}}, + Journal = {International Journal of Systematic Bacteriology}, + Year = {1977}, + + Month = {{jul}}, + Number = {3}, + Pages = {204--210}, + Volume = {27}, + + Doi = {10.1099/00207713-27-3-204} +} + @TechReport{Axelsson:2009, Title = {SfinxBis}, Author = {Axelsson, P{\aa}l}, @@ -36,6 +84,20 @@ @TechReport{Axelsson:2009 Url = {http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf} } +@Article{BaroniUrbani:1976, + Title = {Similarity of Binary Data}, + Author = {Baroni-Urbani, Cesare and Buser, {Mauro W.}}, + Journal = {Systematic Biology}, + Year = {1976}, + + Month = {{sep}}, + Number = {3}, + Pages = {251--259}, + Volume = {25}, + + Doi = {10.2307/2412493} +} + @InProceedings{Bartolini:2002, Title = {String Matching with Metric Trees Using an Approximate Distance}, Author = {Bartolini, Ilaria and Ciaccia, Paolo and Patella, Marco}, @@ -52,6 +114,44 @@ @InProceedings{Bartolini:2002 Url = {http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf} } +@Article{Batagelj:1995, + Title = {Comparing Resemblance Measures}, + Author = {Batagelj, Vladimir and Bren, Matev\v{z}}, + Journal = {Journal of Classification}, + Year = {1995}, + + Month = {{mar}}, + Number = {1}, + Pages = {73--90}, + Volume = {12}, + + Doi = {10.1007/BF01202268} +} + +@Article{Baulieu:1997, + Title = {Two Variant Axiom Systems for Presence/Absence Based Dissimilarity Coefficients}, + Author = {Baulieu, {Forrest B.}}, + Journal = {Journal of Classification}, + Year = {1997}, + Number = {1}, + Pages = {159--170}, + Volume = {14}, + + Doi = {10.1007/s003579900009} +} + +@Article{Baulieu:1989, + Title = {A Classification of Presence/Absence Based Dissimilarity Coefficients}, + Author = {Baulieu, {Forrest B.}}, + Journal = {Journal of Classification}, + Year = {1989}, + Number = {1}, + Pages = {233--246}, + Volume = {6}, 
+ + Doi = {10.1007/BF01908601} +} + @Article{Beider:2008, Title = {Beider-Morse Phonetic Matching: An Alternative to Soundex with Fewer False Hits}, Author = {Beider, Alexander and Morse, {Stephen P.}}, @@ -64,6 +164,45 @@ @Article{Beider:2008 Url = {https://stevemorse.org/phonetics/bmpm.htm} } +@Book{Benini:1901, + Title = {Principii di Demografia}, + Author = {Benini, Rudolfo}, + Publisher = {G. Barbera}, + Year = {1901}, + + Address = {Firenze}, + Number = {29}, + Series = {Manuali Barbera di Scienze Giuridiche Sociali e Politiche}, + + Url = {http://www.archive.org/stream/principiididemo00benigoog} +} + +@Article{Bennet:1954, + Title = {Communications Through Limited-Response Questioning}, + Author = {Bennet, {E. M.} and Alpert, {R.} and Goldstein, {A. C.}}, + Journal = {Public Opinion Quarterly}, + Year = {1954}, + Number = {3}, + Pages = {303--308}, + Volume = {18}, + + Doi = {10.1086/266520} +} + +@Article{Bhattacharyya:1946, + Title = {On a Measure of Divergence between Two Multinomial Populations}, + Author = {Bhattacharyya, {Anil Kumar}}, + Journal = {Sankhyā: The Indian Journal of Statistics (1933-1960)}, + Year = {1946}, + + Month = {{jul}}, + Number = {4}, + Pages = {401--406}, + Volume = {7}, + + Doi = {10.2307/25047882} +} + @Article{Bouchard:1981, Title = {FONEM: Un code de transcription phon{\'{e}}tique pour la reconstitution automatique des familles saguenayennes}, Author = {Bouchard, G{\'{e}}rard and Brard, Patrick and Lavoie, Yolande}, @@ -107,6 +246,56 @@ @Article{Boytsov:2011 Publisher = {ACM} } +@Article{Brainerd:1951, + Title = {The Place of Chronological Ordering in Archaeological Analysis}, + Author = {Brainerd, {George W.}}, + Journal = {American Antiquity}, + Year = {1951}, + + Month = {{apr}}, + Number = {4}, + Pages = {301--313}, + Volume = {16}, + + Doi = {10.2307/276979} +} + +@Book{BraunBlanquet:1932, + Title = {Plant Sociology: The Study of Plant Communities}, + Author = {Braun-Blanquet, Josias}, + Publisher = {McGraw-Hill Book Company}, + Year = {1932}, + + Address = {New York}, + + Url = {https://archive.org/details/plantsociologyst00brau} +} + +@Article{Bray:1957, + Title = {An ordination of upland forest communities of southern Wisconsin}, + Author = {Bray, {J. 
Roger} and Curtis, {John T.}}, + Journal = {Ecological Monographs}, + Year = {1957}, + + Month = {{feb}}, + Number = {4}, + Pages = {325--349}, + Volume = {27}, + + Doi = {10.2307/1942268}, + Url = {http://cescos.fau.edu/gawliklab/papers/BrayJRandJTCurtis1957.pdf} +} + +@InProceedings{Broder:1997, + Title = {On the resemblance and containment of documents}, + Author = {Broder, {Andrei Z.}}, + Booktitle = {Compression and Complexity of Sequences: Proceedings, Positano, Amalfitan Coast, Salerno, Italy, June 11-13, 1997}, + Year = {1997}, + Pages = {21--29}, + + Doi = {10.1109/SEQUEN.1997.666900} +} + @TechReport{Burrows:1994, Title = {A block sorting lossless data compression algorithm}, Author = {Burrows, Michael and Wheeler, {David J.}}, @@ -130,6 +319,37 @@ @TechReport{Caumanns:1999 Url = {https://refubium.fu-berlin.de/bitstream/handle/fub188/18405/tr-b-99-16.pdf} } +@Article{Cha:2006, + Title = {Enhancing Binary Feature Vector Similarity Measures}, + Author = {Cha, {Sung-Hyuk} and Tappert, {Charles C.} and Yoon, Sungsoo}, + Journal = {Journal of Pattern Recognition Research}, + Year = {2006}, + Number = {1}, + Pages = {63--77}, + Volume = {1}, + + Doi = {10.13176/11.20} +} + +@InProceedings{Cha:2008, + Title = {Taxonomy of Nominal Type Histogram Distance Measures}, + Author = {Cha, Sung-Hyuk}, + Booktitle = {Proceedings of the American Conference on Applied Mathematics (MATH '08)}, + Year = {2008}, + + Url = {http://www.wseas.us/e-library/conferences/2008/harvard/math/49-577-887.pdf} +} + +@Article{Choi:2010, + Title = {A Survey of Binary Similarity and Distance Measures}, + Author = {Choi, Seung-Seok and Cha, Sung-Hyuk and Tappert, {Charles C.}}, + Journal = {Systemics, Cybernetics and Informatics}, + Year = {2010}, + Number = {1}, + Pages = {43--48}, + Volume = {8} +} + @Misc{Christen:2011, Title = {Febrl (Freely extensible biomedical record linkage) -- encode.py}, @@ -140,6 +360,29 @@ @Misc{Christen:2011 Url = {https://sourceforge.net/projects/febrl/} } +@TechReport{Christen:2006, + Title = {A Comparison of Personal Name Matching: Techniques and Practical Issues}, + Author = {Christen, Peter}, + Institution = {Australian National University}, + Year = {2006}, + + Address = {Canberra, Australia}, + Number = {TR-CS-06-02}, + + Url = {https://openresearch-repository.anu.edu.au/bitstream/1885/44521/3/TR-CS-06-02.pdf} +} + +@InCollection{Church:1991, + Title = {Using statistics in lexical analysis}, + Author = {Church, Kenneth and Gale, William and Hanks, Patrick and Hindle, Donald}, + Booktitle = {Lexical Acquisition: Exploiting On-Line Resources to Build up a Lexicon}, + Publisher = {Lawrence Erlbaum}, + Year = {1991}, + + Address = {Hillsdale, NJ}, + Pages = {115--164} +} + @Misc{Churchill:2005, Title = {UEAstem.java}, @@ -179,22 +422,133 @@ @Article{Cislak:2017 Url = {http://arxiv.org/abs/1711.08475} } -@Misc{rosettacode:2018, - Title = {Run-length encoding}, +@Article{Clement:1976, + Title = {A Formula for Computing Inter-Observer Agreement}, + Author = {Clement, {Paul W.}}, + Journal = {Psychological Reports}, + Year = {1976}, + Number = {1}, + Pages = {257--258}, + Volume = {39}, - Author = {Rosetta Code}, - Year = {2018}, + Doi = {10.2466/pr0.1976.39.1.257} +} - Url = {https://rosettacode.org/wiki/Run-length_encoding#Python} +@InProceedings{Cohen:2003, + Title = {A Comparison of String Distance Metrics for Name-Matching Tasks}, + Author = {Cohen, {William A.} and Ravikumar, Pradeep and Fienberg, {Stephen E.}}, + Booktitle = {IIWEB'03 Proceedings of the 2003 International Conference 
on Information}, + Year = {2003}, + Pages = {73--78}, + + Url = {http://www.cs.cmu.edu/~wcohen/postscript/ijcai-ws-2003.pdf} } -@Misc{rosettacode:2018b, - Title = {Longest common subsequence}, +@Misc{Cohen:2003b, + Title = {SecondString}, - Author = {Rosetta Code}, - Year = {2018}, + Author = {Cohen, {William W.} and Ravikumar, Pradeep and Fienberg, {Stephen E.} and Rivard, Kathryn}, + Year = {2003}, - Url = {http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6} + Url = {https://github.com/TeamCohen/secondstring} +} + +@Misc{Cohen:2011, + Title = {FuzzyWuzzy: Fuzzy String Matching in Python}, + + Author = {Cohen, Adam}, + Month = {{jul}}, + Year = {2011}, + + Url = {https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/} +} + +@Article{Cohen:1960, + Title = {A coefficient of agreement for nominal scales}, + Author = {Cohen, Jacob}, + Journal = {Educational and Psychological Measurement}, + Year = {1960}, + Number = {1}, + Pages = {37--46}, + Volume = {20}, + + Doi = {10.1177/001316446002000104} +} + +@Article{Cole:1949, + Title = {The Measurement of Interspecific Association}, + Author = {Cole, {Lamont C.}}, + Journal = {Ecology}, + Year = {1949}, + Number = {4}, + Pages = {411--424}, + Volume = {30}, + + Doi = {10.2307/1932444} +} + +@Article{Connolly:1997, + Title = {Quantifying target-realization differences}, + Author = {Connolly, {John H.}}, + Journal = {Clinical Linguistics \& Phonetics}, + Year = {1997}, + Number = {4}, + Pages = {267--287}, + Volume = {11}, + + Doi = {10.3109/02699209708985195} +} + +@Article{Consonni:2012, + Title = {New Similarity Coefficients for Binary Data}, + Author = {Consonni, Viviana and Todeschini, Roberto}, + Journal = {MATCH Communications in Mathematical and in Computer Chemistry}, + Year = {2012}, + Pages = {581--592}, + Volume = {68} +} + +@PhdThesis{Cormode:2003, + Title = {Sequence Distance Embeddings}, + Author = {Cormode, Graham}, + School = {The University of Warwick}, + Year = {2003}, + + Url = {http://wrap.warwick.ac.uk/61310/7/WRAP_THESIS_Cormode_2003.pdf} +} + +@InProceedings{Cormode:2000, + Title = {Communication Complexity of Document Exchange}, + Author = {Cormode, Graham and Paterson, Mike and Sahinalp, {Süleyman Cenk} and Vishkin, Uzi}, + Booktitle = {SODA '00 Proceedings of the eleventh annual ACM-SIAM symposium on Discrete algorithms}, + Year = {2000}, + Pages = {197--200} +} + +@Article{Covington:1996, + Title = {An Algorithm to Align Words for Historical Comparison}, + Author = {Covington, {Michael A.}}, + Journal = {Computational Linguistics}, + Year = {1996}, + + Month = {{dec}}, + Number = {4}, + Pages = {481--496}, + Volume = {22} +} + +@Article{Cronbach:1951, + Title = {Coefficient Alpha and the Internal Structure of Tests}, + Author = {Cronbach, {Lee J.}}, + Journal = {Psychometrika}, + Year = {1951}, + + Month = {{sep}}, + Number = {3}, + Pages = {297--334}, + Volume = {16}, + + Doi = {10.1007/BF02310555} } @TechReport{Cunningham:1969, @@ -207,6 +561,29 @@ @TechReport{Cunningham:1969 Url = {https://files.eric.ed.gov/fulltext/ED029679.pdf} } +@Article{Czekanowski:1909, + Title = {Zur Differentialdiagnose der Neandertalgruppe}, + Author = {Czekanowski, Jan}, + Journal = {Korrespondenz-Blatt der Deutschen Gesellschaft für Anthropologie, Ethnologie und Urgeschichte}, + Year = {1909}, + Pages = {44--47}, + Volume = {40} +} + +@Article{Dagan:1999, + Title = {Similarity-Based Models of Word Cooccurrence Probabilities}, + Author = {Dagan, Ido and Lee, Lillian and Pereira, {Fernando C. 
N.}}, + Journal = {Machine Learning}, + Year = {1999}, + + Month = {{feb}}, + Number = {1--3}, + Pages = {43--69}, + Volume = {34}, + + Doi = {10.1023/A:1007537716579} +} + @Misc{Dalke:2005, Title = {Arithmetic Coder (Python Recipe)}, @@ -216,6 +593,16 @@ @Misc{Dalke:2005 Url = {http://code.activestate.com/recipes/306626/} } +@InProceedings{Dallmeier:2005, + Title = {Lightweight}, + Author = {Dallmeier, Valentin and Lindig, Christian and Zeller, Andreas}, + Booktitle = {ECOOP'05 Proceedings of the 19th European conference on Object-Oriented Programming}, + Year = {2005}, + + Doi = {10.1007/11531142_23}, + Url = {https://www.st.cs.uni-saarland.de/papers/dlz2004/dlz2004.pdf} +} + @Article{Damerau:1964, Title = {A Technique for Computer Detection and Correction of Spelling Errors}, Author = {Damerau, {Fred J.}}, @@ -265,9 +652,18 @@ @Misc{dcm4che:2011 Url = {https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java} } +@InProceedings{Higuera:2008, + Title = {A Contextual Normalised Edit Distance}, + Author = {{de la Higuera}, Colin and Micó, Luisa}, + Booktitle = {First International Workshop on Similarity Search and Applications (sisap 2008)}, + Year = {2008}, + + Doi = {10.1109/SISAP.2008.17} +} + @InProceedings{delPilarAngeles:2016, Title = {Performance of Spanish Encoding Functions during Record Linkage}, - Author = {{del Pilar Angeles}, Mar{\'{\i}}a and Bail{\'{o}}n-Miguel, Noemi}, + Author = {{del Pilar Angeles}, Mar{\'{i}}a and Bail{\'{o}}n-Miguel, Noemi}, Booktitle = {DATA ANALYTICS 2016: The Fifth International Conference on Data Analysis}, Year = {2016}, Pages = {1--7}, @@ -277,7 +673,7 @@ @InProceedings{delPilarAngeles:2016 @InProceedings{delPilarAngeles:2015, Title = {Comparison of a Modified Spanish Phonetic, Soundex, and Phonex coding functions during data matching process}, - Author = {{del Pilar Angeles}, Mar{\'{\i}}a and Espino-Gamez, Adri{\'{a}}n and Gil-Moncada, Jonathan}, + Author = {{del Pilar Angeles}, Mar{\'{i}}a and Espino-Gamez, Adri{\'{a}}n and Gil-Moncada, Jonathan}, Booktitle = {2015 International Conference on Informatics, Electronics Vision (ICIEV)}, Year = {2015}, Month = jun, @@ -288,6 +684,33 @@ @InProceedings{delPilarAngeles:2015 Url = {https://www.researchgate.net/publication/285589803_Comparison_of_a_Modified_Spanish_Phonetic_Soundex_and_Phonex_coding_functions_during_data_matching_process} } +@InProceedings{Dennis:1965, + Title = {The Construction of a Thesaurus Automatic From a Sample of Text}, + Author = {Dennis, {Sally F.}}, + Booktitle = {Statistical Association Techniques for Mechanized Documentation: Symposium Proceedings}, + Year = {1965}, + + Address = {Washington, D.C.}, + Editor = {Stevens, {Mary Elizabeth} and Giuliano, {Vincent E.} and Heilprin, {Laurence B.}}, + Month = {{dec}}, + Number = {269}, + Organization = {United States Department of Commerce}, + Pages = {61--148}, + Series = {National Bureau of Standards Miscellaneous Publication}, + + Url = {https://archive.org/details/statisticalassoc269stev} +} + +@Book{Deza:2016, + Title = {Encyclopedia of Distances}, + Author = {Deza, {Michel Marie} and Deza, Elena}, + Publisher = {Springer-Verlag}, + Year = {2016}, + + Address = {Berlin}, + Edition = {4} +} + @Article{Dice:1945, Title = {Measures of the Amount of Ecologic Association Between Species}, Author = {Dice, {Lee R.}}, @@ -301,6 +724,20 @@ @Article{Dice:1945 Url = {https://www.jstor.org/stable/1932409} } +@Article{Digby:1983, + Title = {Approximating the Tetrachoric Correlation 
Coefficient}, + Author = {Digby, {P. G. N.}}, + Journal = {Biometrics}, + Year = {1983}, + + Month = {{sep}}, + Number = {3}, + Pages = {753--757}, + Volume = {39}, + + Doi = {10.2307/2531104} +} + @Article{Dolby:1970, Title = {An Algorithm For Variable-Length Proper-Name Compression}, Author = {Dolby, {James L.}}, @@ -314,6 +751,97 @@ @Article{Dolby:1970 Url = {https://ejournals.bc.edu/ojs/index.php/ital/article/download/5259/4734} } +@Article{Doolittle:1884, + Title = {The Verification of Predictions}, + Author = {Doolittle, {Myrick H.}}, + Journal = {The American Meteorological Journal}, + Year = {1884}, + Pages = {327--329}, + Volume = {2}, + + Url = {https://books.google.com/books?id=2f0wAQAAMAAJ&dq=%22Finley's%20Tornado%20Predictions%22%20Gilbert&pg=PA327#v=onepage&q&f=false} +} + +@Article{Downey:2008, + Title = {Computational Feature-Sensitive Reconstruction of Language Relationships: Developing the ALINE Distance for Comparative Historical Linguistic Reconstruction}, + Author = {Downey, {Sean S.} and Hallmark, Brian and Cox, {Murray P.} and Norquest, Peter and Lansing, {J. Stephen}}, + Journal = {Journal of Quantitative Linguistics}, + Year = {2008}, + + Month = {{nov}}, + Number = {4}, + Pages = {340--369}, + Volume = {15}, + + Doi = {10.1080/09296170802326681} +} + +@Article{Downey:2017, + Title = {alineR: an R Package for Optimizing Feature-Weighted Alignments and Linguistic Distances}, + Author = {Downey, {Sean S.} and Sun, Guowei and Norquest, Peter}, + Journal = {The R Journal}, + Year = {2017}, + Number = {1}, + Pages = {138--152}, + Volume = {9}, + + Doi = {10.32614/RJ-2017-005}, + Url = {https://journal.r-project.org/archive/2017/RJ-2017-005/RJ-2017-005.pdf} +} + +@Article{Driver:1932, + Title = {Quantitative Expression of Cultural Relationships}, + Author = {Driver, {Harold E.} and Kroeber, {Alfred L.}}, + Journal = {University of California Publications in American Archaeology and Ethnology}, + Year = {1932}, + Number = {4}, + Pages = {211--256}, + Volume = {31}, + + Publisher = {University of California Press}, + Url = {http://digitalassets.lib.berkeley.edu/anthpubs/ucb/text/ucp031-005.pdf} +} + +@Article{Dunning:1993, + Title = {Accurate Methods for the Statistics of Surprise and Coincidence}, + Author = {Dunning, Ted}, + Journal = {Computational Linguistics}, + Year = {1993}, + Number = {1}, + Pages = {61--74}, + Volume = {19}, + + Url = {http://www.aclweb.org/anthology/J93-1003} +} + +@Article{Ehrenfeucht:1988, + Title = {A new distance metric on strings computable in linear time}, + Author = {Ehrenfeucht, Andrzej and Haussler, David}, + Journal = {Discrete Applied Mathematics}, + Year = {1988}, + Number = {3}, + Pages = {191--203}, + Volume = {20}, + + Doi = {10.1016/0166-218X(88)90076-5} +} + +@Book{Eidenberger:2014, + Title = {Categorization and Machine Learning: The Modeling of Human Understanding in Computers}, + Author = {Eidenberger, Horst}, + Publisher = {atpress}, + Year = {2014} +} + +@Book{Ellenberg:1956, + Title = {Grundlagen Der Vegetationsgliederung. Teil 1. 
Aufgaben Und Methoden Der Vegetationskunde}, + Author = {Ellenberg, Heinz}, + Publisher = {Verlag Eugen Ulmer}, + Year = {1956}, + + Address = {Stuttgart} +} + @TechReport{Elovitz:1976, Title = {Automatic Translation of English Text to Phonetics by Means of Letter-to-Sound Rules}, Author = {Elovitz, {Honey S.} and Johnson, {Rodney W.} and McHugh, Astrid and Shore, {John E.}}, @@ -325,6 +853,28 @@ @TechReport{Elovitz:1976 Type = {NRL Report} } +@TechReport{Erikson:1997, + Title = {Approximate Swedish name matching - survey and test of different algorithms}, + Author = {Erikson, Klas}, + Institution = {KTH, Royal Institute of Technology}, + Year = {1997}, + + Address = {Stockholm, Sweden}, + Number = {TRITA-NA-E9721}, + Type = {Nada report}, + + Url = {ftp://ftp.nada.kth.se/pub/documents/Theory/Viggo-Kann/NADA-E9721.pdf} +} + +@Article{Eyraud:1938, + Title = {Les principes de la mesure des corr{\'{e}}lations}, + Author = {Eyraud, Henri}, + Journal = {Annales de l'Universit{\'{e}} de Lyon, III Series, Section A}, + Year = {1938}, + Pages = {30--47}, + Volume = {1} +} + @Article{Furnohr:2002, Title = {Zusammenf{\"{u}}hrung von Datenbest{\"{a}}nden ohne numerische Identifikatoren: ein Verfahren im Rahmen der Testuntersuchungen zu einem registergest{\"{u}}tzten Zensus}, Author = {F{\"{u}}rnrohr, Michael and Rimmelspacher, Birgit and von Roncador, Tilman}, @@ -337,6 +887,99 @@ @Article{Furnohr:2002 Url = {https://www.statistik.bayern.de/medien/statistik/zensus/zusammenf__hrung_von_datenbest__nden_ohne_numerische_identifikatoren.pdf} } +@Article{Fager:1957, + Title = {Determination and Analysis of Recurrent Groups}, + Author = {Fager, {Edward W.}}, + Journal = {Ecology}, + Year = {1957}, + + Month = {{oct}}, + Number = {4}, + Pages = {586--595}, + Volume = {38}, + + Doi = {10.2307/1943124} +} + +@Article{Fager:1963, + Title = {Zooplankton Species Groups in the North Pacific}, + Author = {Fager, {Edward W.} and McGowan, {John A.}}, + Journal = {Science}, + Year = {1963}, + Number = {3566}, + Pages = {453--460}, + Volume = {140}, + + Doi = {10.1126/science.140.3566.453} +} + +@Article{Faith:1983, + Title = {Asymmetric Binary Similarity Measures}, + Author = {Faith, {Daniel P.}}, + Journal = {Oecologia}, + Year = {1983}, + + Month = {{mar}}, + Number = {3}, + Pages = {287--290}, + Volume = {57}, + + Doi = {10.1007/BF00377169} +} + +@Article{Fleiss:1975, + Title = {Measuring Agreement Between Two Judges on the Presence or Absence of a Trait}, + Author = {Fleiss, {Joseph L.}}, + Journal = {Biometrics}, + Year = {1975}, + Number = {3}, + Pages = {651--659}, + Volume = {31}, + + Doi = {10.2307/2529549} +} + +@Book{Fleiss:2003, + Title = {Statistical Methods for Rates and Proportions}, + Author = {Fleiss, {Joseph L.} and Levin, Bruce and Paik, {Myunghee Cho}}, + Publisher = {John Wiley \& Sons}, + Year = {2003}, + + Address = {Hoboken}, + Edition = {3rd}, + Series = {Wiley Series in Probability and Statistics} +} + +@Article{Forbes:1925, + Title = {Method of determining and measuring the associative relations of species}, + Author = {Forbes, {Stephen A.}}, + Journal = {Science}, + Year = {1925}, + Number = {1585}, + Pages = {518--524}, + Volume = {61} +} + +@Article{Forbes:1907, + Title = {On the Local Distribution of Certain Illinois Fishes: An Essay in Statistical Ecology}, + Author = {Forbes, {Stephen A.}}, + Journal = {Bulletin of the Illinois State Laboratory of Natural History}, + Year = {1907}, + Pages = {273--303}, + Volume = {7} +} + +@TechReport{Fossum:1966, + Title = {Optimization and 
Standardization of Information Retrieval Language and Systems}, + Author = {Fossum, {Earl G.} and Kaskey, Gilbert}, + Institution = {Directorate of Information Sciences, Air Force Office of Scientific Research, Office of Aerospace Research, United States Air Force}, + Year = {1966}, + + Address = {Washington, D.C.}, + + Url = {https://archive.org/details/DTIC_AD0630797} +} + @Article{Gadd:1990, Title = {PHONIX: The algorithm}, Author = {Gadd, {T. N.}}, @@ -355,32 +998,141 @@ @Misc{Garshol:2015 Author = {Garshol, {Lars Marius}}, Year = {2015}, - Url = {https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/NorphoneComparator.java} + Url = {https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/NorphoneComparator.java} +} + +@Article{Wilde:1988, + Title = {Nicht w{\"{o}}rtlich genommen, 'Schreibweisentolerante' Suchroutine in dBASE implementiert}, + Author = {Georg, Wilde and Meyer, Carsten}, + Journal = {c't Magazin für Computer Technik}, + Year = {1988}, + + Month = oct, + Number = {10}, + Pages = {126--131} +} + +@Article{Gilbert:1884, + Title = {Finley's Tornado Predictions}, + Author = {Gilbert, {Grove K.}}, + Journal = {American Meteorological Journal}, + Year = {1884}, + Pages = {166--172}, + Volume = {1} +} + +@Article{Gilbert:1966, + Title = {Analysis of Quadrat Data}, + Author = {Gilbert, N. and Wells, {Terry C. E.}}, + Journal = {Journal of Ecology}, + Year = {1966}, + + Month = {{nov}}, + Number = {3}, + Pages = {675--685}, + Volume = {54}, + + Doi = {10.2307/2257810} +} + +@InProceedings{Gill:1997, + Title = {OX-LINK: The Oxford Medical Record Linkage System}, + Author = {Gill, {Leicester E.}}, + Booktitle = {Record Linkage Techniques}, + Year = {1997}, + + Address = {Washington, D.C.}, + Month = mar, + Organization = {Federal Committee on Statistical Methodology}, + Publisher = {Office of Management and Budget}, + + Url = {https://pdfs.semanticscholar.org/fff7/02a3322e05c282a84064ee085e589ef74584.pdf} +} + +@Article{Gini:1915, + Title = {Nuovi contributi all teoria delle relazioni statistiche}, + Author = {Gini, Corrado}, + Journal = {Atti del Reale Istituto Veneto di Scienze, Lettere ed Arti, Series 8}, + Year = {1915}, + Number = {2}, + Pages = {1903--1942}, + Volume = {74} +} + +@Book{Gini:1912, + Title = {Variabilit{\`{a}} e mutabilit{\`{a}}}, + Author = {Gini, Corrado}, + Publisher = {C. 
Cuppini}, + Year = {1912}, + + Address = {Bologna}, + Series = {Contributo allo Studio delle Distribuzioni e delle Relazioni Statistiche} +} + +@Article{Gleason:1920, + Title = {Some Applications of the Quadrat Method}, + Author = {Gleason, {Henry Allan}}, + Journal = {Bulletin of the Torrey Botanical Club}, + Year = {1920}, + + Month = {{jan}}, + Number = {1}, + Pages = {21--33}, + Volume = {47}, + + Doi = {10.2307/2480223} +} + +@Article{Goodall:1967, + Title = {The Distribution of the Matching Coefficient}, + Author = {Goodall, {David W.}}, + Journal = {Biometrics}, + Year = {1967}, + + Month = {{dec}}, + Number = {4}, + Pages = {647--656}, + Volume = {23}, + + Doi = {10.2307/2528419} } -@Article{Wilde:1988, - Title = {Nicht w{\"{o}}rtlich genommen, 'Schreibweisentolerante' Suchroutine in dBASE implementiert}, - Author = {Georg, Wilde and Meyer, Carsten}, - Journal = {c't Magazin für Computer Technik}, - Year = {1988}, +@Article{Goodman:1963, + Title = {Measures of Association for Cross Classification III: Approximate Sampling Theory}, + Author = {Goodman, {Leo A.} and Kruskal, {William H.}}, + Journal = {Journal of the American Statistical Association}, + Year = {1963}, + Number = {302}, + Pages = {310--364}, + Volume = {55}, - Month = oct, - Number = {10}, - Pages = {126--131} + Doi = {10.1080/01621459.1963.10500850} } -@InProceedings{Gill:1997, - Title = {OX-LINK: The Oxford Medical Record Linkage System}, - Author = {Gill, {Leicester E.}}, - Booktitle = {Record Linkage Techniques}, - Year = {1997}, +@Article{Goodman:1959, + Title = {Measures of Association for Cross Classification II: Further Discussion and References}, + Author = {Goodman, {Leo A.} and Kruskal, {William H.}}, + Journal = {Journal of the American Statistical Association}, + Year = {1959}, - Address = {Washington, D.C.}, - Month = mar, - Organization = {Federal Committee on Statistical Methodology}, - Publisher = {Office of Management and Budget}, + Month = {{mar}}, + Number = {285}, + Pages = {123--163}, + Volume = {54}, - Url = {https://pdfs.semanticscholar.org/fff7/02a3322e05c282a84064ee085e589ef74584.pdf} + Doi = {10.2307/2282143} +} + +@Article{Goodman:1954, + Title = {Measures of Association for Cross Classification I}, + Author = {Goodman, {Leo A.} and Kruskal, {William H.}}, + Journal = {Journal of the American Statistical Association}, + Year = {1954}, + Number = {268}, + Pages = {732--764}, + Volume = {49}, + + Doi = {10.2307/2281536} } @Article{Gotoh:1982, @@ -397,6 +1149,41 @@ @Article{Gotoh:1982 Url = {http://www.sciencedirect.com/science/article/pii/0022283682903989} } +@Article{Gower:1971, + Title = {A General Coefficient of Similarities and Some of Its Properties}, + Author = {Gower, {John C.}}, + Journal = {Biometrics}, + Year = {1971}, + + Month = {{dec}}, + Number = {4}, + Pages = {857--871}, + Volume = {27}, + + Doi = {10.2307/2528823} +} + +@Article{Gower:1986, + Title = {Metric and {Euclid}ean Properties of Dissimilarity Coefficients}, + Author = {Gower, {John C.} and Legendre, Pierre}, + Journal = {Journal of Classification}, + Year = {1986}, + + Month = {{feb}}, + Number = {1}, + Pages = {5--48}, + Volume = {3}, + + Doi = {10.1007/BF01896809} +} + +@InProceedings{Gravano:2001, + Title = {Approximate String Joins in a Database (Almost) for Free}, + Author = {Gravano, Luis and Ipeirotis, {Panagiotis G.} and Jagadish, {H. 
V.} and Koudas, Nick and Muthukrishnan, {S.} and Srivastava, Divesh}, + Booktitle = {Proceedings of the 27th VLDB Conference, Roma, Italy, 2001}, + Year = {2001} +} + @InProceedings{Gross:1991, Title = {Getty Synoname: The Development of Software for Personal Name Pattern Matching}, Author = {Gross, {Aaron D.}}, @@ -414,6 +1201,42 @@ @InProceedings{Gross:1991 Url = {http://dl.acm.org/citation.cfm?id=3171004.3171021} } +@Book{Guilford:1956, + Title = {Fundamental Statistics in Psychology and Education}, + Author = {Guilford, {J. P.}}, + Publisher = {McGraw-Hill Book Company}, + Year = {1956}, + + Address = {New York}, + + Url = {https://archive.org/details/in.ernet.dli.2015.228996} +} + +@InCollection{Guttman:1941, + Title = {An Outline of the Statistical Theory of Prediction}, + Author = {Guttman, Louis}, + Booktitle = {The Prediction of Personal Adjustment}, + Publisher = {Social Science Research Council}, + Year = {1941}, + Editor = {Horst, Paul}, + Number = {48}, + Pages = {253--311}, + + Url = {https://babel.hathitrust.org/cgi/pt?id=uc1.b4579784;view=1up;seq=271} +} + +@Article{Gwet:2008, + Title = {Computing inter-rater reliability and its variance in the presence of high agreement}, + Author = {Gwet, {Kilem Li}}, + Journal = {British Journal of Mathematical and Statistical Psychology}, + Year = {2008}, + Number = {1}, + Pages = {29--48}, + Volume = {61}, + + Doi = {10.1348/000711006X126600} +} + @Misc{Haase:2000, Title = {Die Erweiterte K{\"{o}}lner Phonetik}, @@ -421,6 +1244,15 @@ @Misc{Haase:2000 Year = {2000} } +@Article{Hamann:1961, + Title = {Merkmalbestand und Verwandtschaftsbeziehungen der Farinosae: ein Beitrag zum System der Monokotyledonen}, + Author = {Hamann, Ulrich}, + Journal = {Willdenowia}, + Year = {1961}, + Pages = {639--768}, + Volume = {2} +} + @Article{Hamming:1950, Title = {Error detecting and error correcting codes}, Author = {Hamming, {R. 
W.}}, @@ -450,6 +1282,51 @@ @Article{Harman:1991 Url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.9828&rep=rep1&type=pdf} } +@Article{Harris:1978, + Title = {A Method for Combining Occurrence and Nonoccurrence Interobserver Agreement Scores}, + Author = {Harris, {Francis C.} and Lahey, {Benjamin B.}}, + Journal = {Journal of Applied Behavior Analysis}, + Year = {1978}, + Number = {4}, + Pages = {523--527}, + Volume = {11}, + + Doi = {10.1901/jaba.1978.11-523} +} + +@Article{Hassanat:2014, + Title = {Dimensionality Invariant Similarity Measure}, + Author = {Hassanat, {Ahmad Basheer}}, + Journal = {Journal of American Science}, + Year = {2014}, + Number = {8}, + Pages = {221--226}, + Volume = {10}, + + Url = {https://arxiv.org/abs/1409.0923} +} + +@TechReport{Hawkins:1973, + Title = {Reliability Scores That Delude: An Alice in Wonderland Trip Through the Misleading Characteristics of Inter-Observer Agreement Scores in Interval Recording}, + Author = {Hawkins, {Robert P.} and Dotson, {Victor A.}}, + Institution = {Western Michigan University}, + Year = {1973}, + + Url = {https://eric.ed.gov/?id=ED094277} +} + +@Article{Hellinger:1909, + Title = {Neue Begr{\"{u}}ndung der Theorie quadratischer Formen von unendlichvielen Ver{\"{a}}nderlichen}, + Author = {Hellinger, Ernst}, + Journal = {Journal Für Die Reine Und Angewandte Mathematik}, + Year = {1909}, + Number = {136}, + Pages = {210--271}, + Volume = {1909}, + + Doi = {10.1515/crll.1909.136.210} +} + @Article{Henry:1976, Title = {Projet de transcription phon{\'{e}}tique des noms de famille}, Author = {Henry, Louis}, @@ -527,6 +1404,44 @@ @TechReport{Hood:2002 Url = {https://caversham.otago.ac.nz/files/working/ctp060902.pdf} } +@Article{Hubalek:1982, + Title = {Coefficients of Association and Similarity, Based on Binary (Presence-Absence) Data: An Evaluation}, + Author = {Hub{\'{a}}lek, Zdenek}, + Journal = {Biological Reviews}, + Year = {1982}, + + Month = {{feb}}, + Number = {4}, + Pages = {669--689}, + Volume = {57}, + + Doi = {10.1111/j.1469-185X.1982.tb00376.x} +} + +@Article{Hurlbert:1969, + Title = {A Coefficient of Interspecific Association}, + Author = {Hurlbert, {Stuart H.}}, + Journal = {Ecology}, + Year = {1969}, + + Month = {{jan}}, + Number = {1}, + Pages = {1--9}, + Volume = {50}, + + Doi = {10.2307/1934657} +} + +@Book{IBM:2017, + Title = {{IBM SPSS} Statistics Algorithms}, + Author = {{IBM Corporation}}, + Publisher = {{IBM Corporation}}, + Year = {2017}, + Edition = {25}, + + Url = {ftp://public.dhe.ibm.com/software/analytics/spss/documentation/statistics/subscription/en/client/Manuals/IBM_SPSS_Statistics_Algorithms.pdf} +} + @Manual{IBM:1973, Title = {Alpha Search Inquiry System, General Information Manual}, @@ -594,6 +1509,96 @@ @InProceedings{Jiminez:2013 Url = {http://www.aclweb.org/anthology/S13-1028} } +@Article{Johnson:1967, + Title = {Hierarchical Clustering Schemes}, + Author = {Johnson, {Stephen C.}}, + Journal = {Psychometrika}, + Year = {1967}, + + Month = {{sep}}, + Number = {3}, + Pages = {241--254}, + Volume = {32}, + + Doi = {10.1007/BF02289588} +} + +@InProceedings{Jones:2005, + Title = {Empirical Evaluation of the Tarantula Automatic Fault-Localization Technique}, + Author = {Jones, {James A.} and Harrold, {Mary Jean}}, + Booktitle = {ASE '05 Proceedings of the 20th IEEE/ACM international Conference on Automated software engineering}, + Year = {2005}, + + Address = {New York}, + Month = {{nov}}, + Organization = {ACM}, + Pages = {273--282}, + Publisher = {ACM}, + + Doi = 
{10.1145/1101908.1101949}, + ISBN = {1-58113-993-4}, + Location = {Long Beach, CA, USA} +} + +@Article{Koppen:1884, + Title = {Eine Rationelle Methode zur Pr{\"{u}}fung der Wetterprognosen}, + Author = {K{\"{o}}ppen, Wladimir}, + Journal = {Meteorologische Zeitschrift}, + Year = {1884}, + Volume = {1} +} + +@InCollection{Koppen:1870, + Title = {Die Aufeinanderfolge der Periodischen Witterungserscheinungen nach den Grunds{\"{a}}tzen der Wahrscheinlichkeitsrechnung}, + Author = {K{\"{o}}ppen, Wladimir}, + Booktitle = {Repertorium für Meteorologie}, + Publisher = {Akademiia Nauk}, + Year = {1870}, + Pages = {189--238}, + Volume = {2}, + + Url = {https://books.google.com/books?id=1ww0AQAAMAAJ&pg=RA1-PA187#v=onepage&q&f=false} +} + +@MastersThesis{Kempken:2005, + Title = {Bewertung historischer und regionaler Schreibvarianten mit Hilfe von Abstandsmaßen}, + Author = {Kempken, Sebastian}, + School = {Universität Duisburg-Essen}, + Year = {2005}, + Month = {{dec}}, + + Url = {https://duepublico.uni-duisburg-essen.de/servlets/DerivateServlet/Derivate-17252/BewertungSchreibvarianten.pdf} +} + +@Article{Kendall:1938, + Title = {A New Measure of Rank Correlation}, + Author = {Kendall, {Maurice G.}}, + Journal = {Biometrika}, + Year = {1938}, + + Month = {{jun}}, + Number = {1/2}, + Pages = {81--93}, + Volume = {30}, + + Doi = {10.2307/2332226} +} + +@InCollection{Kent:1977, + Title = {Direct Observational Procedure: Methodological Issues in Naturalistic Settings}, + Author = {Kent, {Ronald N.} and Foster, {Sharon L.}}, + Booktitle = {Handbook of Behavioral Assessment}, + Publisher = {John Wiley \& Sons}, + Year = {1977}, + + Address = {New York}, + Chapter = {9}, + Editor = {Ciminero, {Anthony R.} and Calhoun, {Karen S.} and Adams, {Henry E.}}, + Pages = {279--328}, + + Url = {https://archive.org/details/handbookofbehavi00cimi} +} + @InBook{Knuth:1998, Title = {The Art of Computer Programming: Volume 3, Sorting and Searching}, Author = {Knuth, {Donald E.}}, @@ -610,10 +1615,46 @@ @Misc{Kollar:2007 Url = {https://github.com/maros/Text-Phonetic/blob/master/lib/Text/Phonetic/Phonix.pm} } +@PhdThesis{Kondrak:2002, + Title = {Algorithms for Language Reconstruction}, + Author = {Kondrak, Grzegorz}, + School = {University of Toronto}, + Year = {2002}, + + Url = {https://webdocs.cs.ualberta.ca/~kondrak/papers/thesis.pdf} +} + +@InProceedings{Kondrak:2000, + Title = {A New Algorithm for the Alignment of Phonetic Sequences}, + Author = {Kondrak, Grzegorz}, + Booktitle = {NAACL 2000 Proceedings of the 1st North American chapter of the Association for Computational Linguistics conference}, + Year = {2000}, + + Doi = {10.0000/dl.acm.org/974343} +} + +@InProceedings{Kondrak:2004, + Title = {Identification of Confusable Drug Names: A New Approach and Evaluation Methodology}, + Author = {Kondrak, Grzegorz and Dorr, {Bonnie J.}}, + Booktitle = {Proceedings of the 20th International Conference on Computational Linguistics - COLING ’04}, + Year = {2004}, + + Doi = {10.3115/1220355.1220492} +} + +@TechReport{Kondrak:2003, + Title = {A Similarity-Based Approach and Evaluation Methodology for Reduction of Drug Name Confusion}, + Author = {Kondrak, Grzegorz and Dorr, {Bonnie J.}}, + Institution = {University of Maryland, Institute for Advanced Computer Studies}, + Year = {2003}, + + Url = {https://apps.dtic.mil/dtic/tr/fulltext/u2/a452242.pdf} +} + @InProceedings{Koneru:2017, Title = {Privacy Preserving Record Linkage Using MetaSoundex Algorithm}, Author = {Koneru, Keerthi and Varol, Cihan}, - Booktitle = {2017 
16\textsuperscript{th} IEEE International Conference on Machine Learning and Applications (ICMLA)}, + Booktitle = {2017 16th IEEE International Conference on Machine Learning and Applications (ICMLA)}, Year = {2017}, Month = dec, Pages = {443--447}, @@ -622,6 +1663,20 @@ @InProceedings{Koneru:2017 Url = {https://ieeexplore.ieee.org/document/8260671/} } +@Article{Kuder:1937, + Title = {The Theory of the Estimation of Test Reliability}, + Author = {Kuder, {G. Frederic} and Richardson, {Marion Webster}}, + Journal = {Psychometrika}, + Year = {1937}, + + Month = {{sep}}, + Number = {3}, + Pages = {151--160}, + Volume = {2}, + + Doi = {10.1007/bf02288391} +} + @Misc{Kuhn:1995, Title = {Metaphone searches}, @@ -632,6 +1687,48 @@ @Misc{Kuhn:1995 Url = {http://aspell.net/metaphone/metaphone-kuhn.txt} } +@InProceedings{Kuhns:1965, + Title = {The Continuum of Coefficients of Association}, + Author = {Kuhns, {John L.}}, + Booktitle = {Statistical Association Methods for Mechanized Documentation}, + Year = {1964}, + Editor = {Stevens, {Mary Elizabeth} and Giuliano, {Vincent E.} and Heilprin, {Laurence B.}}, + Number = {269}, + Pages = {33--40}, + Series = {National Bureau of Standards Miscellaneous Publication} +} + +@Misc{Kula:2015, + Title = {Simple MinHash implementation in Python}, + + Author = {Kula, Maciej}, + Month = {{jun}}, + Year = {2015}, + + Url = {https://maciejkula.github.io/2015/06/01/simple-minhash-implementation-in-python/} +} + +@Article{Kulczynski:1927, + Title = {Die Pflanzenassoziationen der Pieninen}, + Author = {Kulczyński, Stanis{\l}aw}, + Journal = {Bulletin International de l’Academie Polonaise des Sciences et des Lettres, Classe des Sciences Mathematiques et Naturelles, B (Sciences Naturelles)}, + Year = {1927}, + Pages = {57--203} +} + +@Article{Kullback:1951, + Title = {On Information and Sufficiency}, + Author = {Kullback, {S. A.} and Leibler, {R. A.}}, + Journal = {The Annals of Mathematical Statistics}, + Year = {1951}, + Number = {1}, + Pages = {79--86}, + Volume = {22}, + + Doi = {10.1214/aoms/1177729694}, + Url = {https://projecteuclid.org/euclid.aoms/1177729694} +} + @Article{Legare:1972, Title = {The Early Canadian Population: Problems in Automatic Record Linkage}, Author = {L{\'{e}}gar{\'{e}}, Jacques and Lavoie, Yolande and Charbonneau, Hubert}, @@ -646,6 +1743,15 @@ @Article{Legare:1972 Doi = {10.3138/CHR-053-04-03} } +@Book{Ladefoged:1995, + Title = {A Course in Phonetics}, + Author = {Ladefoged, Peter}, + Publisher = {Harcourt Brace Jovanovich}, + Year = {1995}, + + Address = {New York} +} + @TechReport{Lait:1996, Title = {An Assessment of Name Matching Algorithms}, Author = {Lait, {Andrew J.} and Randell, Brian}, @@ -657,6 +1763,41 @@ @TechReport{Lait:1996 Url = {http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf} } +@Article{Lance:1967, + Title = {A general theory of classificatory sorting strategies. II. Clustering systems.}, + Author = {Lance, {Godfrey N.} and Williams, {William T.}}, + Journal = {Computer Journal}, + Year = {1967}, + + Month = {{jan}}, + Number = {3}, + Pages = {271--277}, + Volume = {10}, + + Doi = {10.1093/comjnl/10.3.271}, + Url = {https://academic.oup.com/comjnl/article-pdf/10/3/271/1333425/100271.pdf} +} + +@Article{Lance:1967b, + Title = {Mixed-data classificatory programs I. 
Agglomerative systems}, + Author = {Lance, {Godfrey N.} and Williams, {William T.}}, + Journal = {Australian Computer Journal}, + Year = {1967}, + Pages = {15--20}, + Volume = {1} +} + +@Article{Lance:1966, + Title = {Computer programs for hierarchical polythetic classification ("similarity analysis")}, + Author = {Lance, {Godfrey N.} and Williams, {William T.}}, + Journal = {Computer Journal}, + Year = {1966}, + Number = {1}, + Volume = {9}, + + Doi = {10.1093/comjnl/9.1.60} +} + @Misc{Lang:2013, Title = {Inner wworking of the German Analyzer in Lucene}, @@ -667,6 +1808,18 @@ @Misc{Lang:2013 Url = {http://www.evelix.ch/unternehmen/Blog/evelix/2013/11/11/inner-workings-of-the-german-analyzer-in-lucene} } +@Book{Legendre:1998, + Title = {Numerical Ecology}, + Author = {Legendre, Pierre and Legendre, Louis}, + Publisher = {Elsevier}, + Year = {1998}, + + Address = {Amsterdam}, + Edition = {2nd}, + Number = {20}, + Series = {Developments in Environmental Modelling} +} + @Article{Levenshtein:1966, Title = {Binary codes capable of correcting deletions, insertions, and reversals}, Author = {Levenshtein, {Vladimir I.}}, @@ -693,6 +1846,26 @@ @Article{Levenshtein:1965 Url = {http://mi.mathnet.ru/dan31411} } +@InProceedings{Lin:2004, + Title = {ROUGE: A Package for Automatic Evaluation of Summaries}, + Author = {Lin, Chin-Yew}, + Booktitle = {Text Summarization Branches Out}, + Year = {2004}, + + Url = {http://aclweb.org/anthology/W04-1013} +} + +@Article{Lodhi:2002, + Title = {Text Classification using String Kernels}, + Author = {Lodhi, Huma and Saunders, Craig and {Shawe-Taylor}, John and Cristianini, Nello and Watkins, Chris}, + Journal = {Journal of Machine Learning Research}, + Year = {2002}, + Pages = {419--444}, + Volume = {2}, + + Doi = {10.1162/153244302760200687} +} + @Article{Lovins:1968, Title = {Development of a Stemming Algorithm}, Author = {Lovins, {Julie Beth}}, @@ -729,9 +1902,21 @@ @Misc{Marcelino:2015 Url = {https://github.com/danielmarcelino/SoundexBR} } +@Article{Maron:1960, + Title = {On Relevance, Probabilistic Indexing and Information Retrieval}, + Author = {Maron, {Melvin E.} and Kuhns, {John L.}}, + Journal = {Journal of the ACM}, + Year = {1960}, + Number = {3}, + Pages = {216--244}, + Volume = {7}, + + Doi = {10.1145/321033.321035} +} + @Article{Matthews:1975, Title = {Comparison of the predicted and observed secondary structure of T4 phage lysozyme}, - Author = {Matthews, Brian W.}, + Author = {Matthews, {Brian W.}}, Journal = {Biochimica et Biophysica Acta (BBA) - Protein Structure}, Year = {1975}, Number = {2}, @@ -739,6 +1924,65 @@ @Article{Matthews:1975 Volume = {405} } +@Article{Matusita:1955, + Title = {Decision Rules, Based on the Distance, for Problems of Fit, Two Samples, and Estimation}, + Author = {Matusita, Kameo}, + Journal = {The Annals of Mathematical Statistics}, + Year = {1955}, + + Month = {{dec}}, + Number = {4}, + Pages = {631--640}, + Volume = {26}, + + Doi = {10.2307/2236376} +} + +@Article{Maxwell:1968, + Title = {Deriving Coefficients of Reliability and Agreement for Ratings}, + Author = {Maxwell, {A. E.} and Pilliner, {A. E. 
G.}}, + Journal = {The British Journal of Mathematical and Statistical Psychology}, + Year = {1968}, + + Month = {{may}}, + Number = {1}, + Pages = {105--116}, + Volume = {21}, + + Doi = {10.1111/j.2044-8317.1968.tb00401.x} +} + +@Article{McConnaughey:1964, + Title = {The Determination and Analysis of Plankton Communities}, + Author = {McConnaughey, {Bayard H.}}, + Journal = {Lembaga Penelitian Laut}, + Year = {1964}, + Number = {Special Number}, + Pages = {1--40} +} + +@Article{Merity:2016, + Title = {Pointer Sentinel Mixture Models}, + Author = {Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, + Journal = {CoRR}, + Year = {2016}, + Volume = {abs/1609.07843}, + + Url = {http://arxiv.org/abs/1609.07843} +} + +@Article{Michael:1920, + Title = {Marine Ecology and the Coefficient of Association: A Plea in Behalf of Quantitative Biology}, + Author = {Michael, {Ellis L.}}, + Journal = {The Journal of Ecology}, + Year = {1920}, + Number = {1}, + Pages = {54--59}, + Volume = {8}, + + Doi = {10.2307/2255213} +} + @Misc{Michael:2007, Title = {phonet.c}, @@ -810,6 +2054,24 @@ @Book{Moore:1977 Url = {https://archive.org/details/accessingindivid00moor} } +@InProceedings{Moreau:2008, + Title = {Robust Similarity Measures for Named Entities Matching}, + Author = {Moreau, Erwan and Yvon, François and Cappé, Olivier}, + Booktitle = {COLING '08 Proceedings of the 22nd International Conference on Computational Linguistics - Volume 1}, + Year = {2008}, + Month = {{aug}}, + Pages = {593--600} +} + +@PhdThesis{Morris:2012, + Title = {A Quantitative Method for Vetting "Dark Network" Intelligence Sources for Social Network Analysis}, + Author = {Morris, {James F.}}, + School = {Air Force Institute of Technology}, + Year = {2012}, + + Url = {https://apps.dtic.mil/dtic/tr/fulltext/u2/a561702.pdf} +} + @InProceedings{Mosquera:2012, Title = {Towards Facilitating the Accessibility of Web 2.0 {T}exts through Text Normalisation}, Author = {Mosquera, Alejandro and Lloret, Elena and Moreda, Paloma}, @@ -817,7 +2079,67 @@ @InProceedings{Mosquera:2012 Year = {2012}, Pages = {9--14}, - Url = {http://www.taln.upf.edu/pages/nlp4ita/pdfs/mosquera-nlp4ita2012.pdf} + Url = {http://www.taln.upf.edu/pages/nlp4ita/pdfs/mosquera-nlp4ita2012.pdf} +} + +@Article{Motyka:1950, + Title = {Wstępne badania nad łąkami południowo-wschodniej Lubelszczyzny (Preliminary studies on meadows in the south-east of the province Lublin)}, + Author = {Motyka, J. and Dobrzański, B. and Zawadzki, S.}, + Journal = {Annales Universitatis Mariae Curie-Skłodowska, Sectio E}, + Year = {1950}, + Number = {13}, + Pages = {367--447}, + Volume = {5} +} + +@InProceedings{Mountford:1962, + Title = {An Index of Similarity and Its Application to Classificatory Problems}, + Author = {Mountford, {M. D.}}, + Booktitle = {Progress in Soil Zoology: Papers from a Colloquium on Research Methods Organized by the Soil Zoology Committee of the International Society of Soil Science}, + Year = {1962}, + + Address = {London}, + Editor = {Murphy, {P. 
W.}}, + Month = {{jul}}, + Pages = {43--50}, + Publisher = {Butterworths}, + + Url = {https://openlibrary.org/books/OL5908681M/Progress_in_soil_zoology} +} + +@Article{Mozley:1936, + Title = {The Statistical Analysis of the Distribution of Pond Molluscs in Western {Canada}}, + Author = {Mozley, Alan}, + Journal = {The American Naturalist}, + Year = {1936}, + Number = {728}, + Volume = {70}, + + Doi = {10.1086/280660} +} + +@Article{Munkres:1957, + Title = {Algorithms for the Assignment and Transportation Problems}, + Author = {Munkres, James}, + Journal = {Journal of the Society for Industrial and Applied Mathematics}, + Year = {1957}, + + Month = {{mar}}, + Number = {1}, + Pages = {32--38}, + Volume = {5}, + + Doi = {10.1137/0105003} +} + +@InProceedings{Naseem:2011, + Title = {Improved Similarity Measures For Software Clustering}, + Author = {Naseem, Rashid and Maqbool, Onaiza and Muhammad, Siraj}, + Booktitle = {Proceedings of the Euromicro Conference on Software Maintenance and Reengineering, CSMR}, + Year = {2011}, + Month = {{mar}}, + + Doi = {10.1109/CSMR.2011.9} } @Article{Navarro:2001, @@ -868,6 +2190,18 @@ @Article{Ochiai:1957 Url = {https://www.jstage.jst.go.jp/article/suisan1932/22/9/22_9_526/_pdf/-char/en} } +@InProceedings{On:2007, + Title = {Group Linkage}, + Author = {On, {Byung Won} and Koudas, Nick and Lee, Dongwon and Srivastava, Divesh}, + Booktitle = {23rd International Conference on Data Engineering, ICDE 2007}, + Year = {2007}, + + Address = {Istanbul, Turkey}, + Pages = {496--505}, + + Doi = {10.1109/ICDE.2007.367895} +} + @Misc{OpenRefine:2012, Title = {Clustering In Depth}, @@ -878,6 +2212,20 @@ @Misc{OpenRefine:2012 Url = {https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth} } +@Article{Orloci:1967, + Title = {An Agglomerative Method for Classification of Plant Communities}, + Author = {Orl{\'{o}}ci, Laszlo}, + Journal = {The Journal of Ecology}, + Year = {1967}, + + Month = {{mar}}, + Number = {1}, + Pages = {193--206}, + Volume = {55}, + + Doi = {10.2307/2257725} +} + @Article{Otsuka:1936, Title = {The faunal character of the {Japan}ese Pleistocene marine Mollusca, as evidence of the climate having become colder during the Pleistocene in {Japan}}, Author = {Otsuka, Yanosuke}, @@ -888,6 +2236,15 @@ @Article{Otsuka:1936 Volume = {6} } +@Misc{Ozbay:2015, + Title = {Ozbay metric}, + + Author = {Ozbay, Hakan}, + Year = {2015}, + + Url = {https://github.com/hakanozbay/ozbay-metric} +} + @InProceedings{Paice:1990, Title = {Another stemmer}, Author = {Paice, {Chris D.}}, @@ -902,6 +2259,16 @@ @InProceedings{Paice:1990 Url = {https://dl.acm.org/citation.cfm?id=101310} } +@InProceedings{Papineni:2002, + Title = {BLEU: A Method for Automatic Evaluation of Machine Translation}, + Author = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, {Wei-Jing}}, + Booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL), Philadelphia, July 2002}, + Year = {2002}, + Pages = {311--318}, + + Url = {https://www.aclweb.org/anthology/P02-1040.pdf} +} + @Article{Parmar:2014, Title = {Study Existing Various Phonetic Algorithms and Designing and Development of a working model for the New Developed Algorithm and Comparison by implementing it with Existing Algorithm(s)}, Author = {Parmar, {Vimal P.} and Kumbharana, {CK}}, @@ -914,6 +2281,76 @@ @Article{Parmar:2014 Doi = {10.5120/17295-7795} } +@InProceedings{Passonneau:2006, + Title = {Measuring Agreement on Set-valued Items (MASI) for Semantic and Pragmatic Annotation}, 
+ Author = {Passonneau, Rebecca}, + Booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC'06)}, + Year = {2006}, + Month = {{may}}, + Pages = {831--836} +} + +@Article{Pearson:1900, + Title = {Mathematical contributions to the theory of evolution. VII. On the correlation of characters not quantitatively measurable}, + Author = {Pearson, Karl}, + Journal = {Philosophical Transactions of the Royal Society}, + Year = {1900}, + Pages = {1--47}, + Volume = {195 A}, + + Doi = {10.1098/rsta.1900.0022} +} + +@Article{Pearson:1913, + Title = {On Theories of Association}, + Author = {Pearson, Karl and Heron, David}, + Journal = {Biometrika}, + Year = {1913}, + Number = {1/2}, + Pages = {159--315}, + Volume = {9}, + + Doi = {10.2307/2331805} +} + +@Article{Pecina:2010, + Title = {Lexical association measures and collocation extraction}, + Author = {Pecina, Pavel}, + Journal = {Language Resources \& Evaluation}, + Year = {2010}, + Number = {1/2}, + Pages = {137--158}, + Volume = {44}, + + Doi = {10.2307/40666353} +} + +@Article{Peirce:1884, + Title = {The Numerical Measure of the Success of Predictions}, + Author = {Peirce, {Charles S.}}, + Journal = {Science}, + Year = {1884}, + Number = {93}, + Pages = {453--454}, + Volume = {4}, + + Doi = {10.1126/science.ns-4.93.453-a} +} + +@Article{Penrose:1952, + Title = {Distance, Size and Shape}, + Author = {Penrose, {Lionel S.}}, + Journal = {Annals of Eugenics}, + Year = {1952}, + + Month = {{jan}}, + Number = {1}, + Pages = {337--343}, + Volume = {17}, + + Doi = {10.1111/j.1469-1809.1952.tb02527.x} +} + @Misc{Pfeifer:2000, Title = {WAIT 1.8 - soundex.c}, @@ -948,7 +2385,7 @@ @Misc{Philips:1990 @Article{Philips:1990b, Title = {Hanging on the Metaphone}, Author = {Philips, Lawrence}, - Journal = {Computer Language Magazine}, + Journal = {Computer Language}, Year = {1990}, Month = dec, @@ -1031,9 +2468,9 @@ @Misc{Prante:2015 Url = {https://github.com/elastic/elasticsearch/blob/master/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java} } -@Article{Raissouli:1991, +@Article{Raissouli:2009, Title = {Arithmetic-Geometric-Harmonic Mean of Three Positive Operators}, - Author = {Ra{\"{\i}}ssouli, Mustapha and Leazizi,Fatima and Chergui, Mohamed}, + Author = {Ra{\"{i}}ssouli, Mustapha and Leazizi,Fatima and Chergui, Mohamed}, Journal = {Journal of Inequalities in Pure and Applied Mathematics}, Year = {2009}, Number = {4}, @@ -1042,6 +2479,29 @@ @Article{Raissouli:1991 Url = {http://www.emis.de/journals/JIPAM/images/014_08_JIPAM/014_08.pdf} } +@TechReport{Radev:2001, + Title = {Evaluation of Text Summarization in a Cross-lingual Information Retrieval Framework}, + Author = {Radev, Dragomir and Teufel, Simone and Saggion, Horacio and Lam, Wai and Blitzer, John and Çelebi, Arda and Qi, Hong and Drabek, Elliott and Liu, Danyu}, + Institution = {Johns Hopkins}, + Year = {2001}, + + Url = {https://pdfs.semanticscholar.org/44a1/df62a1c815fc84aa42788283655a38c85550.pdf} +} + +@Article{Rand:1971, + Title = {Objective Criteria for the Evaluation of Clustering Methods}, + Author = {Rand, {William M.}}, + Journal = {Journal of the American Statistical Association}, + Year = {1971}, + + Month = {{dec}}, + Number = {336}, + Pages = {846--850}, + Volume = {66}, + + Doi = {10.2307/2284239} +} + @Article{Ratcliff:1988, Title = {Pattern Matching: the Gestalt Approach}, Author = {Ratcliff, {John W.} and Metzener, {David E.}}, @@ -1051,6 +2511,30 @@ @Article{Ratcliff:1988 Url = 
{http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970} } +@Article{Rees:2014, + Title = {Taxamatch, an Algorithm for Near ('Fuzzy') Matching on Scientific Names in Taxonomic Databases}, + Author = {Rees, Tony}, + Journal = {PLoS ONE}, + Year = {2014}, + + Month = {{sep}}, + Number = {9}, + Pages = {1--27}, + Volume = {9}, + + Doi = {10.1371/journal.pone.0107510} +} + +@Misc{Rees:2013, + Title = {The MDLD (Modified Damerau-Levenshtein Distance) Algorithm}, + + Author = {Rees, Tony and Boehmer, Barbara}, + Month = {{nov}}, + Year = {2013}, + + Url = {https://confluence.csiro.au/public/taxamatch/the-mdld-modified-damerau-levenshtein-distance-algorithm} +} + @Misc{Repici:2013, Title = {Understanding Classic SoundEx Algorithms}, @@ -1076,6 +2560,18 @@ @InProceedings{Ring:2009 ISBN = {978-3-642-04769-5} } +@Article{Roberts:1986, + Title = {Ordination on the basis of fuzzy set theory}, + Author = {Roberts, {David W.}}, + Journal = {Vegetatio}, + Year = {1986}, + Number = {3}, + Pages = {123--131}, + Volume = {66}, + + Doi = {10.1007/BF00039905} +} + @InProceedings{Robinson:1967, Title = {Results of a prototype television bandwidth compression scheme}, Author = {Robinson, {A. H.} and Cherry, Colin}, @@ -1089,6 +2585,80 @@ @InProceedings{Robinson:1967 Doi = {10.1109/PROC.1967.5493} } +@Article{Robinson:1951, + Title = {A Method for Chronologically Ordering Archaeological Deposits}, + Author = {Robinson, {W. S.}}, + Journal = {American Antiquity}, + Year = {1951}, + + Month = {{apr}}, + Number = {4}, + Pages = {293--301}, + Volume = {16}, + + Doi = {10.2307/276978} +} + +@Article{Rogers:1960, + Title = {A Computer Program for Classifying Plants}, + Author = {Rogers, {David J.} and Tanimoto, {Taffee T.}}, + Journal = {Science}, + Year = {1960}, + + Month = {{oct}}, + Number = {3434}, + Pages = {1115--1118}, + Volume = {132}, + + Doi = {10.1126/science.132.3434.1115} +} + +@Article{Rogot:1966, + Title = {A proposed index for measuring agreement in test-retest studies}, + Author = {Rogot, Eugene and Goldberg, {Irving D.}}, + Journal = {Journal of Chronic Diseases}, + Year = {1966}, + + Doi = {10.1016/0021-9681(66)90032-4} +} + +@Misc{rosettacode:2018, + Title = {Run-length encoding}, + + Author = {{Rosetta Code}}, + Year = {2018}, + + Url = {https://rosettacode.org/wiki/Run-length_encoding#Python} +} + +@Misc{rosettacode:2018b, + Title = {Longest common subsequence}, + + Author = {{Rosetta Code}}, + Year = {2018}, + + Url = {http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6} +} + +@Article{Ruzicka:1958, + Title = {Anwendung mathematische-statistischer Methoden in der Geobotanik (Synthetische Bearbeitung von Aufnahmen)}, + Author = {Rů{\v{z}}i{\v{c}}ka, M.}, + Journal = {Biologia, Bratislava}, + Year = {1958}, + Pages = {647--661}, + Volume = {13} +} + +@InProceedings{Ruibin:2005, + Title = {An Adaptive Model for Phonetic String Search}, + Author = {Ruibin, Gong and Yun, {Chan Kai}}, + Booktitle = {Knowledge-Based Intelligent Information and Engineering Systems, 9th International Conference, KES 2005 Melbourne, Australia, September 14-16, 2005 Proceedings, Part III}, + Year = {2005}, + Pages = {915--921}, + Series = {Lecture Notes in Artificial Intelligence}, + Volume = {3683} +} + @Misc{Rukasz:2018, Title = {PPRL -- Privacy Preserving Record Linkage}, @@ -1098,6 +2668,26 @@ @Misc{Rukasz:2018 Url = {https://github.com/cran/PPRL} } +@InProceedings{Russ:2014, + Title = {Computer-Based Coding of Occupation Codes for Epidemiological Analysis}, + Author = 
{Russ, {Daniel E.} and Ho, Kwan-Yuet and Johnson, {Calvin A.} and Friesen, {Melissa C.}}, + Booktitle = {2014 IEEE 27th International Symposium on Computer-Based Medical Systems}, + Year = {2014}, + Pages = {347--350}, + + Doi = {10.1109/CBMS.2014.79} +} + +@Article{Russell:1940, + Title = {On Habitat and Association of Species of Anopheline Larvae in South-Eastern Madras}, + Author = {Russell, {Paul F.} and Rao, {T. Ramachandra}}, + Journal = {Journal of the Malaria Institute of India}, + Year = {1940}, + Number = {1}, + Pages = {153--178}, + Volume = {3} +} + @Misc{Russell:1917, Title = {Index}, @@ -1157,6 +2747,18 @@ @Article{Schnell:2004 Url = {https://pdfs.semanticscholar.org/2353/21c24ed0401cd05d7752c2c8a8da5b7a4dc0.pdf} } +@Article{Scott:1955, + Title = {Reliability of Content Analysis: The Case of Nominal Scale Coding}, + Author = {Scott, {William A.}}, + Journal = {Public Opinion Quarterly}, + Year = {1955}, + Number = {3}, + Pages = {321--325}, + Volume = {19}, + + Doi = {10.1086/266577} +} + @Article{Seiffert:1993, Title = {Problem 887}, Author = {Seiffert, Heinz-J{\"{u}}rgen}, @@ -1167,6 +2769,15 @@ @Article{Seiffert:1993 Volume = {11} } +@Misc{SequentiX:2018, + Title = {Distance Measures}, + + Author = {SequentiX}, + Year = {2018}, + + Url = {https://www.sequentix.de/gelquest/help/distance_measures.htm} +} + @Article{Shannaq:2010, Title = {Using Product Similarity for Adding Business}, Author = {Shannaq, {Boumedyen A. N.} and Alexandrov, {Victor V.}}, @@ -1181,6 +2792,44 @@ @Article{Shannaq:2010 Url = {https://www.sial.iias.spb.su/files/386-386-1-PB.pdf} } +@Article{Shapira:2007, + Title = {Edit distance with move operations}, + Author = {Shapira, Dana and Storer, {James A.}}, + Journal = {Journal of Discrete Algorithms}, + Year = {2007}, + + Month = {{jun}}, + Number = {2}, + Pages = {380--392}, + Volume = {5}, + + Doi = {10.1016/j.jda.2005.01.010} +} + +@Article{Shi:1993, + Title = {Multivariate data analysis in palaeoecology and palaeobiogeography---a review}, + Author = {Shi, {Guang R.}}, + Journal = {Palaeogeography, Palaeoclimatology, Palaeoecology}, + Year = {1993}, + Number = {3-4}, + Pages = {199--234}, + Volume = {105}, + + Doi = {10.1016/0031-0182(93)90084-v} +} + +@Article{Sidorov:2014, + Title = {Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model}, + Author = {Sidorov, Grigori and Gelbukh, Alexander and G{\'{o}}mez-Adorno, Helena and Pinto, David}, + Journal = {Computación y Sistemas}, + Year = {2014}, + Number = {3}, + Volume = {18}, + + Doi = {10.13053/CyS-18-3-2043}, + Url = {http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a7.pdf} +} + @Article{Simpson:1949, Title = {Measurement of Diversity}, Author = {Simpson, {Edward H.}}, @@ -1220,6 +2869,39 @@ @Article{Smith:1981 Url = {http://www.sciencedirect.com/science/article/pii/0022283681900875} } +@Article{Sokal:1958, + Title = {A Statistical Method for Evaluating Systematic Relationships}, + Author = {Sokal, {Robert R.} and Michener, {Charles D.}}, + Journal = {The University of Kansas Science Bulletin}, + Year = {1958}, + + Month = {{mar}}, + Number = {22}, + Pages = {1409--1438}, + Volume = {38, part 2}, + + Url = {https://archive.org/details/cbarchive_133648_astatisticalmethodforevaluatin1902} +} + +@Book{Sokal:1963, + Title = {Principles of Numerical Taxonomy}, + Author = {Sokal, {Robert R.} and Sneath, {Peter H. A.}}, + Publisher = {W. H. 
Freeman and Company}, + Year = {1963}, + + Address = {San Francisco} +} + +@InProceedings{Somers:1998, + Title = {Similarity Metrics for Aligning Children's Articulation Data}, + Author = {Somers, {Harold L.}}, + Booktitle = {ACL '98/COLING '98 Proceedings of the 36th Annual Meeting of the Association for Computational Linguistics and 17th International Conference on Computational Linguistics - Volume 2}, + Year = {1998}, + Pages = {1227--1232}, + + Doi = {10.3115/980691.980769} +} + @Misc{Song:2011, Title = {Typo-Distance}, @@ -1229,6 +2911,40 @@ @Misc{Song:2011 Url = {https://github.com/wsong/Typo-Distance} } +@Book{Sorgenfrei:1958, + Title = {Molluscan Assemblages from the Marine Middle Miocene of South Jutland and Their Environments}, + Author = {Sorgenfrei, Theodor}, + Publisher = {Danmarks Geologiske Undersøgelse}, + Year = {1958}, + + Pages = {1--503}, + Number = {79}, + Series = {2} +} + +@Article{Steffensen:1934, + Title = {On Certain Measures of Dependence Between Statistical Variables}, + Author = {Steffensen, {J. F.}}, + Journal = {Biometrika}, + Year = {1934}, + + Month = {{may}}, + Number = {1/2}, + Pages = {251--255}, + Volume = {26}, + + Doi = {10.2307/2332058} +} + +@TechReport{Steingold:2015, + Title = {An Information Theoretic Metric for Multi-Class Categorization}, + Author = {Steingold, Sam and Laclav{\'{i}}k, Michal}, + Institution = {Magnetic Media Online}, + Year = {2015}, + + Url = {https://github.com/Magnetic/proficiency-metric/blob/master/paper/predeval.pdf} +} + @Misc{Stern:2014, Title = {DamerauLevenshteinAlgorithm.java}, @@ -1238,6 +2954,47 @@ @Misc{Stern:2014 Url = {https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java} } +@Article{Stiles:1961, + Title = {The Association Factor in Information Retrieval}, + Author = {Stiles, {H. 
Edmund}}, + Journal = {Journal of the ACM}, + Year = {1961}, + + Month = {{apr}}, + Number = {2}, + Pages = {271--279}, + Volume = {8}, + + Doi = {10.1145/321062.321074} +} + +@InProceedings{Stoilos:2005, + Title = {A String Metric for Ontology Alignment}, + Author = {Stoilos, Giorgos and Stamou, Giorgos and Kollias, Stefanos}, + Booktitle = {ISWC'05 Proceedings of the 4th international conference on The Semantic Web}, + Year = {2005}, + + Address = {Galway, Ireland}, + Month = {{nov}}, + Pages = {624--637}, + + Doi = {10.1007/11574620_45} +} + +@Article{Stuart:1953, + Title = {The Estimation and Comparison of Strengths of Association in Contingency Tables}, + Author = {Stuart, A.}, + Journal = {Biometrika}, + Year = {1953}, + + Month = {{jun}}, + Number = {1/2}, + Pages = {105--110}, + Volume = {40}, + + Doi = {10.2307/2333101} +} + @Article{Szymkiewicz:1934, Title = {Une contribution statistique {\`{a}} la g{\'{e}}ographie floristique}, Author = {Szymkiewicz, Dezydery}, @@ -1268,6 +3025,29 @@ @TechReport{Tanimoto:1958 Year = {1958} } +@Article{Tarwid:1960, + Title = {Szacowanie zbieznosci nisz ekologicznych gatunkow droga oceny prawdopodobienstwa spotykania sie ich w polowach}, + Author = {Tarwid, Kazimierz}, + Journal = {Ekologia Polska, Seria B}, + Year = {1960}, + Number = {6}, + Pages = {115--130} +} + +@Article{Tichy:1984, + Title = {The String-to-String Correction Problem with Block Moves}, + Author = {Tichy, {Walter F.}}, + Journal = {ACM Transactions on Computer Systems}, + Year = {1984}, + + Month = {{nov}}, + Number = {4}, + Pages = {309--321}, + Volume = {2}, + + Doi = {10.1145/357401.357404} +} + @Misc{Ticki:2016, Title = {Eudex: A blazingly fast phonetic reduction/hashing algorithm}, @@ -1286,6 +3066,26 @@ @Misc{Ticki:2016b Url = {http://ticki.github.io/blog/the-eudex-algorithm/} } +@InCollection{Tulloss:1997, + Title = {Assessment of Similarity Indices for Undesirable Properties and a New Tripartite Similarity Index Based on Cost Functions}, + Author = {Tulloss, {Rodham E.}}, + Booktitle = {Mycology in Sustainable Development: Expanding Concepts, Vanishing Borders}, + Publisher = {Parkway Publishers, Inc.}, + Year = {1997}, + + Address = {Boone, NC}, + Editor = {Palm, {Mary E.} and Chapela, {Ignacio H.}}, + Pages = {122--143} +} + +@InCollection{Turner:1988, + Title = {Packaging Information for Peer Review: New Co-Word Analysis Techniques}, + Author = {Turner, {W. 
A.} and Charton, {G.} and Laville, {F.} and Michelet, {B.}}, + Booktitle = {Handbook of Quantitative Studies of Science and Technology}, + Publisher = {New Holland}, + Year = {1988} +} + @Article{Tversky:1977, Title = {Features of Similarity}, Author = {Tversky, Amos}, @@ -1299,6 +3099,18 @@ @Article{Tversky:1977 Url = {http://www.cogsci.ucsd.edu/~coulson/203/tversky-features.pdf} } +@Article{Ukkonen:1992, + Title = {Approximate string-matching with q-grams and maximal matches}, + Author = {Ukkonen, Esko}, + Journal = {Theoretical Computer Science}, + Year = {1992}, + Number = {1}, + Pages = {191--211}, + Volume = {92}, + + Doi = {10.1016/0304-3975(92)90143-4} +} + @Misc{US:2007, Title = {Soundex System: The Soundex Indexing System}, @@ -1323,6 +3135,32 @@ @Book{US:1997 Url = {https://hdl.handle.net/2027/pur1.32754067050041} } +@Article{Upholt:1977, + Title = {Estimation of {DNA} sequence divergence from comparison of restriction endonuclease digests}, + Author = {Upholt, {William B.}}, + Journal = {Nucleic Acids Research}, + Year = {1977}, + + Month = {{jan}}, + Number = {5}, + Pages = {1257--1265}, + Volume = {4}, + + Doi = {10.1093/nar/4.5.1257} +} + +@Article{Maarel:1969, + Title = {On the Use of Ordination Model in Phytosociology}, + Author = {{van der Maarel}, Eddy}, + Journal = {Vegetatio Acta Geobotanica}, + Year = {1969}, + + Month = {{jan}}, + Number = {1--6}, + Pages = {21--46}, + Volume = {19} +} + @Article{Varol:2012, Title = {Hybrid Matching Algorithm for Personal Names}, Author = {Varol, Cihan and Bayrak, Coskun}, @@ -1375,6 +3213,58 @@ @Article{Wagner:1974 Publisher = {ACM} } +@Article{Wang:2014, + Title = {Extending String Similarity Join to Tolerant Fuzzy Token Matching}, + Author = {Wang, Jiannan and Li, Guoliang and Feng, Jianhua}, + Journal = {ACM Transactions on Database Systems}, + Year = {2014}, + Number = {1}, + Pages = {1--45}, + Volume = {39}, + + Doi = {10.1145/2535628} +} + +@PhdThesis{Warrens:2008, + Title = {Similarity Coefficients for Binary Data: Properties of Coefficients, Coefficient Matrices, Multi-way Metrics and Multivariate Coefficients}, + Author = {Warrens, {Matthijs J.}}, + School = {Universiteit Leiden}, + Year = {2008}, + + Address = {Leiden}, + Month = {{jun}}, + + Journal = {Psychometrika}, + Number = {3}, + Pages = {487--502}, + Url = {https://openaccess.leidenuniv.nl/bitstream/handle/1887/12987/Full_thesis.pdf}, + Volume = {73} +} + +@Article{Whittaker:1952, + Title = {A Study of Summer Foliage Insect Communities in the Great Smoky Mountains}, + Author = {Whittaker, {R. 
H.}}, + Journal = {Ecological Monographs}, + Year = {1952}, + + Month = {{jan}}, + Number = {1}, + Pages = {1--44}, + Volume = {22}, + + Doi = {10.2307/1948527} +} + +@Book{Whittaker:1982, + Title = {Ordination of Plant Communities}, + Author = {Whittaker, {Robert H.}}, + Publisher = {Springer Netherlands}, + Year = {1982}, + Number = {2}, + Series = {Handbook of Vegetation Science}, + Volume = {5} +} + @Misc{Wikibooks:2018, Title = {Algorithm Implementation/Strings/Longest common substring}, @@ -1417,9 +3307,40 @@ @Misc{Winkler:1994 Url = {https://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c} } +@PhdThesis{Xiang:2013, + Title = {Similarity-based Virtual Screening: Effect of the Choice of Similarity Measure}, + Author = {Xiang, Hua}, + School = {The University of Sheffield}, + Year = {2013}, + + Url = {http://etheses.whiterose.ac.uk/5662/1/Thesis_Final.pdf} +} + +@Misc{Yang:2016, + Title = {New metrics for learning and inference on sets, ontologies, and functions}, + + Author = {Yang, Ruiyu and Jiang, Yuxiang and Hahn, {Matthew W.} and Houseworth, {Elizabeth A.} and Radivojac, Predrag}, + Month = {{mar}}, + Year = {2016}, + + Url = {https://arxiv.org/abs/1603.06846v1} +} + +@Article{Yates:1934, + Title = {Contingency Tables Involving Small Numbers and the $\chi^2$~{T}est}, + Author = {Yates, Frank}, + Journal = {Supplement to the Journal of the Royal Statistical Society}, + Year = {1934}, + Number = {2}, + Pages = {217--235}, + Volume = {1}, + + Doi = {10.2307/2983604} +} + @Article{Youden:1950, Title = {Index for Rating Diagnostic Tests}, - Author = {Youden, William John}, + Author = {Youden, {William John}}, Journal = {Cancer}, Year = {1950}, Number = {1}, @@ -1429,6 +3350,39 @@ @Article{Youden:1950 Doi = {10.1002/1097-0142(1950)3:1<32::aid-cncr2820030106>3.0.co;2-3} } +@Article{Yujian:2007, + Title = {A Normalized Levenshtein Distance Metric}, + Author = {Yujian, Li and Bo, Liu}, + Journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + Year = {2007}, + Number = {6}, + Pages = {1091--1095}, + Volume = {29}, + + Doi = {10.1109/TPAMI.2007.1078} +} + +@Article{Yule:1912, + Title = {On the Methods of Measuring Association Between Two Attributes}, + Author = {Yule, {G. Udny}}, + Journal = {Journal of the Royal Statistical Society}, + Year = {1912}, + Number = {6}, + Volume = {75}, + + Doi = {10.2307/2340126} +} + +@Book{Yule:1968, + Title = {An Introduction to the Theory of Statistics}, + Author = {Yule, {G.
Udny} and Kendall, {Maurice G.}}, + Publisher = {Griffin}, + Year = {1968}, + + Address = {London}, + Edition = {14} +} + @Misc{Zackwehdex:2014, Title = {Super Fast and Accurate string distance algorithm: Sift4}, @@ -1450,7 +3404,7 @@ @Misc{Zedlitz:2015 @InProceedings{Zobel:1996, Title = {Phonetic String Matching: Lessons from Information Retrieval}, Author = {Zobel, Justin and Dart, Philip}, - Booktitle = {Proceedings of the 19\textsuperscript{th} Annual International ACM SIGIR Conference on Research and Development in Information Retrieval}, + Booktitle = {Proceedings of the 19th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval}, Year = {1996}, Address = {New York, NY, USA}, @@ -1465,3 +3419,4 @@ @InProceedings{Zobel:1996 Numpages = {7} } +@Comment{jabref-meta: databaseType:bibtex;} diff --git a/docs/abydos.compression.rst b/docs/abydos.compression.rst index ed1d79a10..f730b6a65 100644 --- a/docs/abydos.compression.rst +++ b/docs/abydos.compression.rst @@ -4,5 +4,6 @@ abydos.compression package .. automodule:: abydos.compression :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.corpus.rst b/docs/abydos.corpus.rst index 07625936b..77119ccb8 100644 --- a/docs/abydos.corpus.rst +++ b/docs/abydos.corpus.rst @@ -4,5 +4,6 @@ abydos.corpus package .. automodule:: abydos.corpus :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.distance.rst b/docs/abydos.distance.rst index e546ee42d..33941eae0 100644 --- a/docs/abydos.distance.rst +++ b/docs/abydos.distance.rst @@ -4,5 +4,6 @@ abydos.distance package .. automodule:: abydos.distance :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.fingerprint.rst b/docs/abydos.fingerprint.rst index 6a3cbf0db..cb6c9e36a 100644 --- a/docs/abydos.fingerprint.rst +++ b/docs/abydos.fingerprint.rst @@ -4,5 +4,6 @@ abydos.fingerprint package .. automodule:: abydos.fingerprint :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.phones.rst b/docs/abydos.phones.rst index fb189db25..5caba12b3 100644 --- a/docs/abydos.phones.rst +++ b/docs/abydos.phones.rst @@ -4,5 +4,6 @@ abydos.phones package .. automodule:: abydos.phones :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.phonetic.rst b/docs/abydos.phonetic.rst index 46389466b..27c91c54d 100644 --- a/docs/abydos.phonetic.rst +++ b/docs/abydos.phonetic.rst @@ -4,5 +4,6 @@ abydos.phonetic package .. automodule:: abydos.phonetic :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.rst b/docs/abydos.rst index 6c968474f..7c505e2e8 100644 --- a/docs/abydos.rst +++ b/docs/abydos.rst @@ -4,6 +4,7 @@ abydos package .. automodule:: abydos :members: :undoc-members: + :private-members: :show-inheritance: Subpackages diff --git a/docs/abydos.stats.rst b/docs/abydos.stats.rst index 0f5154ae0..7e4338cd9 100644 --- a/docs/abydos.stats.rst +++ b/docs/abydos.stats.rst @@ -4,5 +4,6 @@ abydos.stats package .. automodule:: abydos.stats :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.stemmer.rst b/docs/abydos.stemmer.rst index 9fabd72cd..5a5e1e73c 100644 --- a/docs/abydos.stemmer.rst +++ b/docs/abydos.stemmer.rst @@ -4,5 +4,6 @@ abydos.stemmer package .. 
automodule:: abydos.stemmer :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.tokenizer.rst b/docs/abydos.tokenizer.rst index 492fb0694..cb20f3ca2 100644 --- a/docs/abydos.tokenizer.rst +++ b/docs/abydos.tokenizer.rst @@ -4,5 +4,6 @@ abydos.tokenizer package .. automodule:: abydos.tokenizer :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/abydos.util.rst b/docs/abydos.util.rst index df8ea076b..f80142668 100644 --- a/docs/abydos.util.rst +++ b/docs/abydos.util.rst @@ -4,5 +4,6 @@ abydos.util package .. automodule:: abydos.util :members: :undoc-members: + :private-members: :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index cfe47d04b..4001fd796 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,6 +53,7 @@ ] smartquotes = False +autoclass_content = 'both' # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -72,7 +73,7 @@ # General information about the project. project = 'Abydos' # noinspection PyShadowingBuiltins -copyright = '2018, Christopher C. Little' +copyright = '2014-2019, Christopher C. Little' author = 'Christopher C. Little' # The version info for the project you're documenting, acts as replacement for diff --git a/helpers/bm_php2py.py b/helpers/bm_php2py.py index e0407f95b..2986fcb59 100755 --- a/helpers/bm_php2py.py +++ b/helpers/bm_php2py.py @@ -341,24 +341,20 @@ def _run_script(): for s in subdirs: sd = s - tail_text += '\nBMDATA[\'' + s + '\'] = {}\n' - tail_text += 'BMDATA[\'' + s + '\'][\'approx\'] = {}\n' - tail_text += 'BMDATA[\'' + s + '\'][\'exact\'] = {}\n' - tail_text += 'BMDATA[\'' + s + '\'][\'rules\'] = {}\n' - tail_text += 'BMDATA[\'' + s + '\'][\'hebrew\'] = {}\n\n' + tail_text += "\nBMDATA['" + s + "'] = {}\n" + tail_text += "BMDATA['" + s + "']['approx'] = {}\n" + tail_text += "BMDATA['" + s + "']['exact'] = {}\n" + tail_text += "BMDATA['" + s + "']['rules'] = {}\n" + tail_text += "BMDATA['" + s + "']['hebrew'] = {}\n\n" tail_text += ( - 'BMDATA[\'' + "BMDATA['" + s - + '\'][\'language_rules\'] = _' + + "']['language_rules'] = _" + s.upper() + '_LANGUAGE_RULES\n' ) tail_text += ( - 'BMDATA[\'' - + s - + '\'][\'languages\'] = _' - + s.upper() - + '_LANGUAGES\n' + "BMDATA['" + s + "']['languages'] = _" + s.upper() + '_LANGUAGES\n' ) phps = [ @@ -442,8 +438,8 @@ def _run_script(): nl = True outfile.write( - '\n\nif __name__ == \'__main__\':\n import doctest\n\n\ - doctest.testmod()\n' + "\n\nif __name__ == '__main__':\n import doctest\n\n\ + doctest.testmod()\n" ) diff --git a/helpers/call_and_write_log.py b/helpers/call_and_write_log.py index 3bc1c7919..373a529cf 100755 --- a/helpers/call_and_write_log.py +++ b/helpers/call_and_write_log.py @@ -43,14 +43,13 @@ def _run_script(): - const_ret = None + const_ret = False if len(sys.argv) > 2: try: const_ret = int(sys.argv[2]) except ValueError: pass - retval = 1 if len(sys.argv) > 1: args = sys.argv[1].split() if args[0] not in { @@ -60,10 +59,12 @@ def _run_script(): 'doc8', 'pydocstyle', }: - sys.exit(const_ret if const_ret is not None else retval) + sys.exit(1) with open(args[0] + '.log', 'w') as output: retval = call(args, stdout=output, shell=False) # noqa: S603 - sys.exit(const_ret if const_ret is not None else retval) + if args[0] in {'pylint', 'pycodestyle'}: + retval = 0 + sys.exit(const_ret if const_ret else retval) if __name__ == '__main__': diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..0dd18d96c --- /dev/null +++ 
b/requirements-dev.txt @@ -0,0 +1,6 @@ +# Library dependencies for the Python code. You need to install these with +# `pip install -r requirements-dev.txt` before you can run this. + +scipy +nltk +syllabipy diff --git a/requirements.txt b/requirements.txt index 319cc62e4..4f6f33044 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ numpy six +deprecation diff --git a/setup.cfg b/setup.cfg index 20cc89e00..be97bc1df 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,12 +47,13 @@ load-plugins=pylint.extensions.docparams,pylint.extensions.bad_builtin [doc8] verbose=0 +max-line-length=80 ignore-path=build,.tox,docs/_build,abydos.egg-info,tests/fuzz/corpora/blns.txt,abydos/phonetic/_beider_morse_data.py extensions=.py [pydocstyle] match=.*\.py -match_dir=[^\.](?!build|docs).* +match_dir=[^\.](?!uild|ocs).* ;convention=numpy ignore=D203,D213,D402,D202 diff --git a/setup.py b/setup.py index 9a4f504b7..64bf390c5 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ def readfile(fn): 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', @@ -100,7 +99,7 @@ def readfile(fn): long_description='\n\n'.join( [readfile(f) for f in ('README.rst', 'HISTORY.rst', 'AUTHORS.rst')] ), - install_requires=['numpy', 'six'], + install_requires=['numpy', 'six', 'deprecation'], extras_require={ ':python_version >= "2.7" and python_version < "2.8"': [ 'pyliblzma>=0.5.3,<0.6.0' diff --git a/tests/__init__.py b/tests/__init__.py index 828c9ecb2..144f8e9ae 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -57,7 +57,7 @@ 'Nel', 'Kneale', 'Uí Néill', - 'O\'Neill', + "O'Neill", 'MacNeil', 'MacNele', 'Niall Noígíallach', diff --git a/tests/compression/test_compression_bwt.py b/tests/compression/test_compression_bwt.py index 145928434..34791ef71 100644 --- a/tests/compression/test_compression_bwt.py +++ b/tests/compression/test_compression_bwt.py @@ -37,22 +37,24 @@ class BWTTestCases(unittest.TestCase): """Test abydos.compression.BWT.encode and .decode.""" coder = BWT() + coder_pipe = BWT('|') + coder_dollar = BWT('$') def test_bwt_encode(self): """Test abydos.compression.BWT.encode.""" # Examples from Wikipedia entry on BWT self.assertEqual(self.coder.encode(''), '\x00') - self.assertEqual(self.coder.encode('^BANANA', '|'), 'BNN^AA|A') + self.assertEqual(self.coder_pipe.encode('^BANANA'), 'BNN^AA|A') self.assertEqual( - self.coder.encode( - 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES', '|' + self.coder_pipe.encode( + 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES' ), 'TEXYDST.E.IXIXIXXSSMPPS.B..E.|.UESFXDIIOIIITS', ) - self.assertEqual(self.coder.encode('aardvark', '$'), 'k$avrraad') + self.assertEqual(self.coder_dollar.encode('aardvark'), 'k$avrraad') - self.assertRaises(ValueError, self.coder.encode, 'ABC$', '$') + self.assertRaises(ValueError, self.coder_dollar.encode, 'ABC$') self.assertRaises(ValueError, self.coder.encode, 'ABC\0') # Test wrapper @@ -62,17 +64,17 @@ def test_bwt_decode(self): """Test abydos.compression.BWT.decode.""" self.assertEqual(self.coder.decode(''), '') self.assertEqual(self.coder.decode('\x00'), '') - self.assertEqual(self.coder.decode('BNN^AA|A', '|'), '^BANANA') + self.assertEqual(self.coder_pipe.decode('BNN^AA|A'), '^BANANA') self.assertEqual( - self.coder.decode( - 'TEXYDST.E.IXIXIXXSSMPPS.B..E.|.UESFXDIIOIIITS', '|' + 
self.coder_pipe.decode( + 'TEXYDST.E.IXIXIXXSSMPPS.B..E.|.UESFXDIIOIIITS' ), 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES', ) - self.assertEqual(self.coder.decode('k$avrraad', '$'), 'aardvark') + self.assertEqual(self.coder_dollar.decode('k$avrraad'), 'aardvark') - self.assertRaises(ValueError, self.coder.decode, 'ABC', '$') + self.assertRaises(ValueError, self.coder_dollar.decode, 'ABC') self.assertRaises(ValueError, self.coder.decode, 'ABC') # Test wrapper @@ -91,7 +93,7 @@ def test_bwt_roundtripping(self): ): self.assertEqual(self.coder.decode(self.coder.encode(w)), w) self.assertEqual( - self.coder.decode(self.coder.encode(w, '$'), '$'), w + self.coder_dollar.decode(self.coder_dollar.encode(w)), w ) diff --git a/tests/corpora/fake_words.csv b/tests/corpora/fake_words.csv new file mode 100644 index 000000000..9ab082437 --- /dev/null +++ b/tests/corpora/fake_words.csv @@ -0,0 +1,2401 @@ +error,corrected +abiersp,rabies +abile,able +abionia,Ionian +ableazines,sleazinesses +ables,bales +ablistiess,established +aboulged,abounded +abrasic,abrasion +abrics,fabrics +accher,cacher +acchs,Sachs +acclux,ACLU +acemented,cemented +achroucis,anachronism +aciste,racist +ackeses,wackes +acking,caking +acmed,acned +actal,actual +acton,Acton +acturrecos,resurrects +aculans,Cullan +acycludech,acyclically +adbahle,Allahabad +adbions,additions +adecolly,accolade +aded,add +adel,dale +adeleerer,Adelbert +adifoceble,modifiable +adightion,addition +adins,dins +adiscal,radical +adits,audits +adizermin,gormandizer +admines,amines +admitaing,admitting +adrehylay,rehydrate +aerhy,aether +aers,eyers +affasions,affections +afiers,afters +aftters,afters +ageouth,outrageous +agetrened,tragedienne +aggera,dragger +aggize,Aggie +aggly,waggly +agly,ally +agones,agonies +agracum,ageratum +aing,eying +aings,sings +alan,anal +aling,laing +alion,lion +alis,ails +alized,alkalized +allyal,ally +altery,alter +amariptort,participator +amatic,dramatic +ambatchmor,Cochabamba +amles,males +amnic,manic +amodiamirs,diamonds +amperodeus,ampersands +ampres,amperes +amureds,assureds +anaudionat,audition +anavors,savors +aned,abed +anic,antic +anilly,nilly +animunaly,animally +aning,saning +anistroons,anisotropy +anizate,animate +anks,ans +annizes,tyrannizes +ansy,nays +antenaer,antennae +anthes,antes +anticang,anticking +antioperte,antipoverty +antle,ante +antubbling,intubating +anulabine,alanine +aphancom,phantom +aphauns,haunts +apher,aper +aphol,Adolpho +aphypers,hypersphere +apidayst,apiarist +apiters,piasters +aplar,apolar +aportrater,portrayer +apporty,rapport +apreakers,streakers +apsourtal,rhapsodical +aptins,apt ins +aption,caption +aptne,paten +apyron,apron +arakent,partaken +aransm,Ransom +aratic,arctic +arationse,aerations +aray,ray +arbolisred,abolished +ardamang,Andaman +ared,ares +aress,areas +argistion,registration +arip,rip +arive,rive +arizatic,Adriatic +arized,prized +arker,archer +arlines,airlines +arrial,aerial +artendomat,heathendom +arthancers,furtherances +ascalising,miscalling +ascetione,Ascension +asenly,absently +aser,ares +asherelle,Asheville +ashisep,hashish +asing,aching +asivemited,assimilated +asivenes,evasiveness +askouse,gaseous +asoent,assent +asplate,as plate +assed,asses +atalin,Katalin +ater,tare +atider,arider +ation,nation +ational,atonal +atiosing,passionating +atity,atty +atuldether,leatherette +augs,rugs +auloced,located +aumbarte,barterer +aunsue,ensue +authon,author +autonially,autonomously +aveosiver,erosive +averses,averse +avoturity,maturity +awan,swan +awredly,aw redly 
+axachicad,chicaned +axamnism,Arianism +axesslised,relaxedness +axined,Maxine +babla,tabla +baccer,baker +bacuples,backplanes +bagely,bagel +balang,baling +baliter,baiter +ballity,banality +bangundise,Brandise +bantion,bastion +baran,barn +bardsont,adsorbent +barisatly,barbarically +barviss,Jarvis +bary,bray +bastely,basely +bathener,bather +baticated,abdicated +baticks,batiks +beal,veal +bealing,vealing +bectiner,Bertine +becupless,scruples +bedel,betel +bedinabled,indefinable +beele,bile +begotly,begot +bele,bee +beleshl,fleshless +belfwoming,becoming +belisel,bellicose +belyterid,terribly +bemborks,embarks +bemoder,be moder +bented,vented +benters,enters +bently,gently +beress,berets +bermick,mimicker +berota,be rota +bers,bees +bespoled,bes poled +bess,Bess +betial,bestial +bicing,vicing +bics,bis +bidly,idly +biggricke,bickering +bijaps,bitmaps +biliers,billers +bilike,bi like +billeed,bellied +bilmonard,billboard +binomperm,combiner +birrecleds,birdseeds +bistemone,moistener +bitaus,bi taus +bited,bighted +bity,bit +bizes,baizes +blealkst,leafstalk +blecculis,blissful +blediater,bleater +blendednes,blessedness +bles,bled +bletted,bleated +blextre,extricable +boact,boat +boatand,boat and +bobion,bob ion +boeth,both +bohed,boned +bohy,boy +boismonic,Mormonism +bokasity,verbosity +bolearis,Borealis +boler,bole +boles,lobes +bolous,bolos +bonally,banally +bons,bins +booferisms,boosterism +boons,buns +bork,Bork +bosm,bosom +botobte,Bobette +bously,gibbously +bowess,bowers +boxycherds,chessboards +brably,brambly +brabown,Brownian +bralyase,brassily +brarily,arbitrarily +brate,brat +bratisaph,Bratislava +brectics,anorectics +brent,Brent +breon,freon +brihious,lubricious +brof,bro +brogic,biologic +buctindple,conductible +bulass,bu lass +bulcon,bullion +bulneg,bulge +bunduans,Burundians +bunness,busyness +burkincers,burnishers +burlity,Burlie +burs,boors +burzing,burning +busail,bu sail +butivingly,bunglingly +butly,butyl +butrage,outrage +butring,burring +cabercum,caber cum +cabilers,calibers +cadrovered,overeducated +cags,cages +calaus,callus +cald,clad +cale,lace +calemorer,clamorer +calic,calico +calimped,ca limped +calinete,Catiline +calited,calibrated +calitifuls,capitalistic +calizate,caliphate +calize,canalize +calizes,canalizes +cals,lacs +cancy,chancy +candiked,candied +candippy,can dippy +canessity,necessity +cannialing,canalizing +cantion,canton +caphee,Cathee +capotion,caption +captisent,antiseptic +carawls,crawls +carbarboon,carbonation +cass,sacs +casserang,canvasser +catentionc,catenation +cateria,Caterina +cating,acting +cationt,cation +catogeable,allocatable +cattis,catties +caturil,caricature +cauted,caused +caux,aux +ceate,crate +cedly,redly +cend,vend +cenes,scenes +cenlisid,dissidence +certgoty,certainty +cetapic,acetic +chabily,charily +chactity,chastity +chast,chats +chawforrum,Crawford +chboloss,Schloss +chea,sea +ched,teed +chemes,hemes +chers,tiers +ches,tees +chhona,cinchona +chies,chis +chil,chi +chily,chilly +chirt,chit +chises,sises +chlorsible,chlorine +chness,chess +chning,tining +choe,chow +chons,sons +chormill,Churchill +chortchly,chortler +chrolocy,chronology +churations,conjurations +chutip,chip +cier,vier +cieter,dieter +cinglin,tingling +cionly,sonly +cise,ices +cition,cation +citize,citizen +cive,vice +clables,cables +clackamb,blackamoor +clal,call +cled,clef +cledisord,disordered +clens,clench +cler,clear +clers,clears +cles,clues +cleurvene,cleverness +clier,clear +clies,clues +clis,clips +clisoper,clipper +clours,clouts +clufrotte,clutter 
+cnerk,clerk +coaddic,addiction +coarplaten,copperplate +coate,coach +cocialize,socialize +cociess,cockiness +codepads,code pads +coer,cur +coftisater,commiserate +cogializes,socializes +cognes,cones +cohduatove,commutative +cohmisaity,comicality +cohylly,Colly +colabowed,cola bowed +colaterity,collateral +colcifeds,coloreds +colco,coco +coldse,colds +colete,Colette +coliess,coolies +colivenes,costiveness +colized,colorized +colizindes,collides +colled,coled +colocker,clocker +colous,coleus +colow,colon +colows,colons +comang,coming +comere,comer +comeress,compress +comerocest,comestible +comilnes,comeliness +comitille,committable +comle,come +comphic,complicit +complarcic,complainer +comproid,compromised +conater,coater +conatlered,consternated +conco,coco +conders,cinders +conentess,contriteness +conesering,cone sering +confle,conflate +coniteles,aconites +cono,con +conoschily,consciously +consar,consular +conscrejar,consecrate +conses,cones +consing,conching +cont,cony +conts,contd +convulte,convulse +cophidly,lymphoid +cophotpic,photocopier +copi,cop +coplery,coppery +coppe,cope +copulysts,copulates +coquir,coquetry +corass,crass +corectight,corrective +coriss,Coriss +cormile,Corneille +corplart,corporal +cors,cots +cortaines,certainest +cory,Cory +cotioniss,coalitionist +cotomeavid,photometric +cotors,rotors +cotry,co try +cougners,countersign +cougnoth,cottonmouth +counbural,conjectural +counnity,community +couring,coring +cous,couch +couse,course +couss,cuss +coute,couch +coutived,outlived +crace,crave +crading,carding +craper,carper +craphiness,graphicness +cratolegly,collegiately +creado,credo +crection,erection +crectork,corrector +cred,red +cres,res +crial,rial +crike,trike +crinalve,criminal +crinconize,colonizer +cristilize,Christianize +criving,riving +croccused,caroused +crochy,crochet +crocla,Wroclaw +croculach,oracular +crodoltion,introduction +croilition,coalition +croing,crewing +croped,coped +crophydner,hydrophone +croses,crises +crous,crouch +crumonic,unmnemonic +cryphe,cypher +cubiandens,custodians +cullight,taillight +culviviver,circulative +cummingly,Cummings +cureferred,cu referred +curlea,curler +curottoid,carotid +cutious,curious +cycogracke,crackerjack +cymptity,captivity +dained,fained +dalizes,vandalizes +danted,dated +dasal,nasal +dayagic,Yagi +deadent,deaden +decanize,canonized +decationg,defecation +decess,recess +decloss,closeness +decoric,decor +decoushana,decorous +dectalial,dialectal +decte,detected +dectis,detectives +dectoctry,directory +deculvidic,ridicule +decurliger,deciliter +dedric,Cedric +deged,edged +deisher,dasher +deler,deer +dembles,dissembles +demorstion,demolition +dempers,dampers +denal,deal +denars,debars +dending,sending +denet,dent +dengly,denyingly +dentalega,dental +deoximper,peroxide +depanad,deadpan +dephize,deputize +depled,depleted +deptic,septic +deranise,derange +deremers,demurrers +deriable,derivable +derisvise,derisive +derizate,derivate +deroing,redoing +derophoal,spheroidal +derrates,serrated +ders,reds +dess,fess +dethed,detached +devenths,sevenths +dever,sever +dewawle,dewclaw +dextists,dentists +diactrise,distracted +dialime,medial +dially,dually +diamy,diam +dianners,dinners +diannes,Diannne +diatopoly,oligopoly +dibly,dilly +dicaliss,radicalism +dican,divan +dici,dis +diciasmar,radicalism +dicrith,diacritic +dicry,dicey +diedynoss,diagnosed +dieloge,die loge +dier,dear +diesulsans,diocesans +diferte,different +diffes,differs +difyinely,divinely +diller,rilled +dily,idly +dimison,Madison +dimmus,dim mus 
+dinations,donations +dinegera,Geraldine +dinhecte,directed +dinsh,dins +dintion,diction +diopar,parodied +diousnes,odiousness +diphulan,Dilan +dippring,dipping +disass,dis ass +dise,dies +dised,sided +dises,sides +disessomy,lissomness +disler,dialer +dism,dims +diss,dis +distarrims,diarists +distonoff,distortion +diteling,datelining +dittlend,tittled +dity,tidy +diumning,dinning +divels,devils +dizateary,dietary +dizatisent,digitization +dize,size +dizentopy,entropy +dizes,disses +dizess,iodizes +dochans,chansons +dochess,duchess +dogamon,monogamy +dogicash,dogfish +dogisatic,dogmatics +dognpers,nonperson +dolingus,doling us +domer,moder +donds,fonds +donize,ionize +dorearizes,deodorizes +doring,roding +dorrised,Dorise +dothots,dot hots +dotornizes,motorizes +douffite,outfitted +doush,douse +dowgic,Lodowick +dowl,fowl +drang,grand +dranst,strand +drantermic,determinant +drants,drats +drater,darter +draywer,drawer +droadle,drollery +dron,don +drot,trod +duats,dusts +dulanity,modularity +dulmidorip,humidor +dulur,dull +dumetron,metronome +dunrerter,unreported +duristuran,disturbance +dwaboapar,paperboard +eadjayst,adjusted +eadwom,madwomen +eboura,Debora +ecksm,checks +econcel,conceal +ecte,detect +ection,section +ectipless,electiveness +ectrion,trisection +efact,fact +efing,feign +efinis,finis +eflard,flared +efunds,funds +efying,eying +eigness,bigness +eium,medium +ejecad,ejecta +ejusubsolo,subsocietal +eled,eked +elem,elem +eler,leer +elibley,eligible +elizate,Elizabet +elly,Elly +elogy,logy +elom,elm +elum,elm +emation,emotion +emblenon,ensemble +emet,meet +emics,emacs +emidinise,semidefinite +emilizedia,familiarized +emin,min +emisbut,euphemist +emiseird,premised +emmed,gemmed +emnatious,emanation +emonind,demonic +emparizer,emphasizer +emparnes,nearnesses +enations,nations +encal,encl +encault,Renault +ences,hences +encori,encore +ency,envy +enduckon,duckpond +enessubdum,sublimeness +enferrouky,nonferrous +engus,genus +ennizatic,enigmatic +ennows,endows +enopir,Renoir +enry,entry +entater,en tater +enterved,entered +enthology,ethology +entle,gentle +entroth,en troth +ents,nets +enuent,unguent +enwons,en wons +enyies,envies +ephiroier,prophesier +eption,option +equelizins,equalizes +equers,ewers +equiezinal,equinoctial +eragous,ragouts +ereatic,erratic +erfbane,banner +ericastal,peristaltic +ering,reign +erlogres,ogresses +erted,treed +erts,erst +esily,easily +esurgatite,surrogate +etrinally,doctrinally +etris,eris +etrong,strong +ettilis,librettist +eult,ult +euts,ruts +evardizer,standardizer +eveighs,inveighs +evessorce,sorceress +evions,evictions +evitessity,emissivity +evolisi,scoliosis +evoromon,Voronezh +exablist,establish +exanneshor,shortness +exclowids,exclusions +excreau,excreta +excvindle,dwindle +exent,extent +exenthols,menthols +exeraboy,execrably +exern,extern +exes,exes +exesid,exes id +exess,exes +exhaw,ex haw +exiblers,flexible +exic,exec +exord,order +exotalcome,exotically +expeatte,expatiate +expers,experts +expery,expert +expete,expedite +expole,expose +expraysive,expressive +extess,excess +extordos,extorts +extramisms,extremisms +extrox,extrovert +exts,ext +exuanes,nexuses +exus,nexus +facatiard,tachycardia +facenoff,effacement +fachiph,hibachi +facic,facial +fack,flack +faclined,inclined +facoly,acolyte +facs,fas +faincaings,fairings +falails,flails +faling,faking +fallazzon,Fallon +fantations,incantations +farism,far ism +fased,phased +fashing,dashing +fations,factions +fatisms,fat isms +fausnes,famousness +feamisted,feasted +fecis,fess +fecoments,comments 
+fecredendo,crescendoed +ferier,fierier +ferms,germs +ferosquing,squinting +fershew,fer shew +ferwoly,wolfer +fetres,ferrets +fible,feeble +ficilging,financing +ficiurices,sacrifices +ficon,icon +fidenegrak,degrading +fidines,tidiness +fiect,fie ct +fier,fair +fiernise,finisher +filemna,filename +filiver,fiver +fing,ding +firon,iron +fisemal,fishmeal +fizema,Fizeau +flabler,fabler +flannes,flannels +flawns,flaws +flextry,flex try +flopic,floppily +flumises,flu mises +fluritive,nutritive +fluss,flus +flusty,fusty +fograpers,fog rapers +foisappack,disappointed +folied,foiled +folovats,footstool +fonaer,fonder +foody,goody +forap,foray +fordle,fondle +fored,gored +forewass,forewarns +forieres,foreseer +forierse,foreseer +forily,gorily +forized,Sanforized +fors,fros +forsh,frosh +fosm,foam +fous,fours +fragized,fraternized +fralist,federalist +frate,grate +frattess,frustrates +frazinial,Frazier +fred,Fred +frest,frets +fric,uric +frise,fries +froele,Rozele +frolota,frontal +froussell,Larousse +frumal,frugal +fugh,ugh +fulager,fuselage +fulate,plateful +fulatiolly,functionally +fulbers,fullers +fuldes,fullness +fule,flue +fulke,fluke +fulnece,fullness +fulned,funned +fultis,cultist +funestry,funnest +fung,dung +futtly,futilely +gaide,aide +galiale,Galilee +galisines,gassiness +galiusing,signaling +gamenise,gamesmen +ganic,manic +garin,grain +gase,ages +gass,sags +gatili,Gatling +gatiling,Gatling +gationoxes,gestation +gatium,gallium +gatorting,gator ting +geet,git +gentiving,agenting +geogmet,geog met +geousnes,gorgeousness +geratiess,generates +gerearatee,regenerate +gerphirar,geographic +gerthum,Humbert +giater,gaiter +gice,give +gile,file +gionisting,insisting +gises,guises +gity,git +gize,size +gler,glee +glesisde,legless +glifous,glorious +glogy,logy +glor,glory +glotgrowns,outgrown +glyst,straggly +glyzably,analyzable +gologiver,apologizer +gonarin,Goldarina +gonic,sonic +gonserned,governed +gorintincy,contingency +gosemmy,goose +graigm,Graig +graing,grain +graliesse,graceless +gralspant,grandparent +granes,granges +granic,granitic +grany,grant +graphos,graphs +grapil,grail +grast,grass +gratoreaty,migratory +graval,gravel +grayarises,gray arises +grayse,grays +gred,dreg +gres,ergs +grilly,frilly +grioneer,pioneer +grizatib,Kiribati +grobfic,robotic +grogic,groggily +gromb,groom +guane,guano +gulfor,gulf or +gultheria,Gualterio +gunching,hunching +gunde,guide +gutrus,gurus +hable,gable +hablon,halon +haendogess,handsomeness +haer,hear +hailises,syphilises +halipsont,antiphonal +halism,Alisha +halizaters,totalizators +hamodulate,ha modulate +hamon,halon +hanchly,chancy +handus,hands +hanial,Danial +haniffine,iffiness +hanipals,municipals +hanise,anise +hannes,hennas +hapistogen,pathogenesis +harboling,harboring +harger,charger +harial,ha rial +harilly,charily +hartudge,Carthage +hass,sash +havisially,behaviorally +heards,heard +hecultia,cultivate +hedahen,Daphene +heladultid,adulation +hele,heel +helebris,Hebrides +heles,heels +hematiling,humiliating +hemong,hegemony +henally,penally +heous,house +heower,hewer +herast,Hearst +heront,heron +hersh,Hersh +hervilry,chervil +hesheaved,he sheaved +hetionizin,ionization +hetting,getting +hilectal,dialectal +hing,hung +hinnizess,shininess +hisatral,historical +hober,hover +hobly,holy +hocyclogy,psychology +holass,ho lass +holons,colons +holoss,ho loss +homentrum,momentum +hongler,shingler +hoodop,hoodoo +hoome,home +hoomerturs,homerooms +hoona,Poona +hooteds,hooted +hories,hosier +hormate,formate +hothsm,smooths +hotiong,hooting +hotorism,motorist 
+hots,host +houbtager,homager +huady,heady +huffunjoly,huffily +huncy,hunch +hurred,hurried +hushwhumy,bushwhack +hybredly,hybrid +hydrewily,hydrolyze +hydriknuce,hydraulicked +hylayst,Shaylah +hypeated,hyphenated +hypnopmer,hypnotherapy +hypt,hype +hystolop,hyssop +hyterers,hysteresis +hythloged,mythologized +hytorymise,prehistory +hytrost,Ostrogoth +iddly,oddly +ident,dent +ider,dire +idimmer,dimmer +igan,gain +iged,aged +igic,Gigi +ilavery,slavery +iled,lied +ilherles,cheerless +ilised,irised +ilitic,illicit +illically,idyllically +ilorise,Deloris +imah,imam +imakdal,decimal +imater,mater +imbler,nimbler +imbly,nimbly +imbravide,braider +imed,idem +imel,mile +imet,mite +imilled,milled +imised,irised +imity,dimity +imized,minimized +imphy,imply +impievel,impulsive +implate,imp late +imulme,mule +inably,in ably +inailizess,internalizes +inaircy,Irina +inal,final +inalions,inclinations +inaly,inlay +inar,dinar +inboar,inboard +incer,nicer +incoly,Lincoln +incree,increase +incy,inc +indes,ides +inds,ind +indsorus,Windsors +inemboly,embolism +ines,Ines +iness,sines +inesse,finesse +inetrang,penetrating +infless,inf less +infrallues,influences +ingang,in gang +ingents,ingests +inglutook,inglenook +ingrace,ingrate +ingsittic,sitting +inick,nick +inising,incising +inity,unity +inkelly,inkwell +inkity,infinity +inned,inned +inni,inn +inoe,Linoel +inquiles,inquires +inser,inset +insinify,intensify +intaphes,intactness +inte,inch +inthip,int hip +intist,insist +ints,nits +inuckhoot,bucktooth +inya,Anya +iondram,ion dram +irmered,Meredithe +irmines,ermines +irreadle,treadle +irte,rite +isalises,valises +isepuslis,republish +ishlexan,mishandle +itality,vitality +ited,tied +jadepinth,Jacinthe +jagge,jagged +jaing,jading +jandron,Alejandro +jaraxise,praxis +jarped,harped +jecleining,cleaning +jerblamous,blasphemous +jeway,leeway +jilla,villa +jingoly,jingly +jireles,tireless +jogglater,joggler +joise,noise +josquist,mosquito +joups,soups +joustracon,constraint +jowed,owed +judst,just +jugootize,robotize +justal,Justus +kagee,leakage +kallrysed,Krystalle +kaosing,Kaohsiung +kenditier,rendition +kents,kens +keoporms,preforms +kerize,Keri +kers,kens +kete,kite +keting,keying +kinaphony,phony +kinatomyth,anatomy +kindes,kinds +kipeascing,peacemaking +kistries,dentistries +kity,kit +klocked,locked +knation,nation +knocate,invocate +kolled,lolled +labagina,imaginable +labic,labia +lablizess,stabilizes +lacule,lacunae +lader,alder +lagenta,magenta +lagizedial,digitalized +lagizes,plagiarizes +lallics,metallics +lanabright,Cantabrigian +lang,Lang +lanoated,annotated +lans,labs +lant,lat +lapors,labors +laralmant,antimalarial +larizes,velarizes +larnow,Arno +latabled,la tabled +lation,talion +lationa,national +latiors,violators +latist,latest +latore,la tore +latoser,lactose +layal,loyal +layst,lays +lebrosts,frosts +lect,let +legled,gelled +lese,else +letion,lesion +lexily,sexily +libidef,libido +licion,Felicio +lienst,liens +ligaric,oligarchic +ling,king +lininabi,libidinal +lisapper,misapplier +lise,lies +liser,loser +lises,loses +lisiggatch,latching +lisin,Lissi +lism,ism +listic,stylistic +lizable,liable +lizinescry,crystallizes +lizing,sizing +lizings,sizings +logiewy,logical +logity,logicality +lomatied,diplomatic +lonte,lento +lootne,looter +louskeasts,Southeasts +lubic,cubic +lubinfeers,inferiors +lubminsfu,luminous +luey,gluey +lunlanida,Daniella +lury,lure +lustathoer,statehouse +maccless,manacles +macers,macer +mached,maced +mactres,actresses +madrably,ma drably +magge,magged +magle,male 
+magnodomes,diagnometers +maity,amity +malics,malices +malie,male +maling,malign +mally,mall +manardics,mansards +manc,man +mand,mans +mandintly,demandingly +manduader,demander +manize,maize +manized,humanized +manne,mane +manova,ma nova +mansm,mans +mansynt,manservant +mantates,mandates +mante,mane +manted,mantes +mantrace,man trace +mants,manta +mards,mars +marinism,Arianism +marve,Marve +mase,same +maslon,mason +materaing,mattering +materseck,checkmate +matie,mate +matimpox,Appomattox +matin,main +matingly,ma tingly +mation,mason +mationaph,machination +mations,masons +matives,natives +matortore,reformatory +matoterest,overestimate +mattess,mattes +meampse,seamer +meation,mention +mefretry,telemetry +megant,meant +melfs,melds +melisic,Melisse +mely,melt +menaings,meanings +menannes,meannesses +menceroty,commencer +mendead,mended +meng,men +menizate,Mennonite +mensiong,mensing +mersatell,satellite +metativer,meditative +metemons,monsters +metionak,mentionable +mettes,metes +meturing,metering +mialle,millennial +micappe,pemmican +micartiona,ramification +micate,medicate +micaus,micas +mickies,sickies +micrailhed,microfilmed +micry,miry +mics,misc +midalies,miladies +miertmerak,shirtmaker +migy,miry +milimid,Mildrid +milise,Milissent +mily,limy +minaliver,miniver +minfek,feminine +mingly,mingle +mingnoots,nothingness +mique,mike +mirliever,reliever +misake,mistake +miscid,misdid +mised,mused +mish,mush +mishitese,mishitting +misie,Missie +mism,mis +miste,mist +misyl,Missy +mityph,Mitty +mitz,mite +mize,maize +mizes,misses +moca,coma +mocle,mole +modine,iodine +moeigen,Imogen +moling,mewling +molisata,moralist +monovely,monopoly +monserphic,meromorphic +monsings,Monsignors +monsivelen,Simmonsville +monsowdess,monsoons +moolic,Mongolic +mooptiorps,resorptions +mord,dorm +moreds,mored +morgang,Morgan +mormuslay,glamorous +morric,morris +mortuts,mortuary +motle,motel +mounron,Monro +mous,nous +mousity,animosity +mouslerl,mouser +moustic,acoustic +moustin,mousing +mouts,mots +mulabigran,Mulligan +multes,mules +munbals,murals +munes,menus +munic,music +munizing,immunizing +munmer,mummer +munnus,mun nus +murf,surf +murivil,Murillo +murloval,Murial +mutiousion,mutilation +myed,med +myos,mys +myosose,Yosemite +mytortzy,mythology +nabity,tenability +nached,naked +naclevaty,tabernacle +nalize,nasalize +nallang,gallanting +nally,ally +naphars,naphthas +narmated,narrated +narons,barons +natiggisty,nationalist +neeits,neats +nell,Nell +nequal,equal +nerb,nerd +ners,nets +nertang,Nerta +nesed,need +ness,sens +nisatiffil,affiliation +nisdowmal,Malinowski +nocisions,decisions +noclic,noncyclic +noct,cont +nofter,noter +nograval,granola +nogwors,worsens +nogy,nosy +noins,nouns +nolam,no lam +nolameente,lamented +nomes,nones +nomovers,no movers +nompt,Compton +nompuric,monomeric +nomyophys,physiognomy +nons,non +nortated,notated +notonchord,nonconductor +nouts,bouts +noveym,novene +nulne,ulnae +obbleneony,cobblestone +obeaminals,abdominals +obed,bode +obilbel,mobile +obions,onions +obitze,obit +oble,bole +obleyst,noblest +oblogere,blogger +ochreel,Rochell +ocknogy,Cockney +ocrals,corals +odueaddial,dialogue +odys,odes +oflovelles,lovelessness +ofter,softer +olecoaccom,accomplice +olowelwary,hollowware +onacry,coronary +opagambed,propagated +oper,pore +operke,openwork +ophic,optic +ophogne,homophone +oplogim,imploring +opollst,pollster +opper,topper +opplenting,implementing +opses,poses +oradayera,forayer +ordo,rood +oribed,orbed +oring,iring +ornomental,ornamental +orped,roped +orse,rose +ortril,trillion 
+ourist,purist +ouror,furor +ousless,tousles +outter,putter +ouzz,ouzo +ovatifier,ratifier +oved,roved +ovelly,lovely +ovenatine,venation +oveneout,novene +oventioges,inventions +oves,voes +oviges,overages +ovors,overs +oxestertor,stertorous +oxice,oxide +oxingus,Xingu +oxis,xis +oxise,oxide +oxylet,oxyacetylene +oxyter,oyster +ozing,zing +pachood,Packwood +pacranum,cranium +pactind,pact ind +padile,paddle +pagersting,pager sting +palees,pales +paliappyr,partiality +palint,plaint +palka,parka +panal,anal +pand,pans +panker,spanker +paranyal,paranormal +parcum,par cum +pardste,pasteboard +paria,para +paripht,pariah +parles,pares +parloretry,parquetry +parogy,parody +paront,patron +partions,parsons +patarmins,ptarmigans +patentects,patentees +pathorbing,path orbing +pati,pas +patieng,patient +patioge,patio +pativ,patio +patoursed,coursed +paty,pat +pautast,pasta +peasm,peas +pecapiesse,escapism +peckle,speckle +pectial,pectoral +pection,pectin +pedly,redly +pelen,peen +peluffult,peaceful +penctolin,penicillin +penneter,pentameter +pense,pens +pente,pent +pepaph,epitaph +peptorp,preceptor +perally,penally +perbang,per bang +perbirter,perverter +percon,person +peres,pairs +peroshers,perishers +peroy,perky +perrize,Perrine +pers,perch +perveny,perverter +petaturect,prefecture +phalotion,inhalation +phatinad,patina +phemer,ephemera +phendeake,Daphene +phents,pents +pher,fer +pherwory,periphery +phible,bibliophile +phic,chip +phie,fie +phinte,hinter +phion,pion +phisesep,Pharisee +phitica,Titicaca +pholdweama,placeholder +pholong,prolong +phossing,phosphine +phydrize,hydride +phydrobey,hydrophobic +phyposp,phosphor +phys,phis +physts,phys ts +pical,pica +pictin,pectin +pidallin,Idalina +pien,pain +pieresse,presser +pight,pit +pingerdam,pinger dam +pinvorin,pivoting +pinvulats,insulates +pirally,spirally +pircaont,Pitcairn +pireaug,spirea +pirk,pork +pirpred,perspired +pirrang,parring +pirric,empiric +pistedly,disgustedly +pistry,pastry +pitents,potents +piting,piing +placcial,glacial +plader,pleader +plancon,plan con +planess,planes +plaper,paper +plaphizate,philatelist +plarlanted,transplanted +platical,plastically +plations,platoons +plec,plea +pleds,pleads +pler,per +plerity,celerity +plessed,pleased +pletillize,palletized +pliatess,palliates +plined,plied +plisa,Elisa +plitises,colitises +plobing,lobing +ploguls,Moguls +plutionnom,pollution +poidly,pointedly +pokess,pokes +polammon,Lammond +poless,poles +pollora,Apollo +polly,Polly +polonce,pol once +polore,pol ore +polvally,polytonally +pometes,poetesses +pompates,pomp ates +ponation,donation +ponpoulate,populate +pordizes,jeopardizes +porely,sorely +pors,porch +posarely,positively +poscuscous,couscous +posisatcal,positional +postost,postorder +posuffness,bluffness +potabouses,potables +potips,potions +pous,pouch +powl,pow +praphors,phosphors +prawes,prawns +preals,peals +precies,preses +precomer,prerecord +prectic,pectic +pred,peed +preflor,preform +pregeassub,preassign +prele,prelate +premo,promo +premps,preps +prensfuse,profuseness +prepatince,preparation +prephes,prep hes +prer,peer +prermon,Hermon +prerriumed,presumed +pres,Pres +presice,precise +pressibon,repression +prestal,prenatal +pretrecer,pretreated +prevatent,prevalent +previses,precises +prialike,pretrial +priems,primes +profly,pro fly +proge,grope +prolos,polos +proloviver,proverbial +prooke,provoke +prope,prop +propix,prolix +propluslum,slumberous +prositon,positron +prot,port +proted,ported +protracits,protracts +proug,group +prownies,brownies +proy,pro +pruse,purse 
+pryolly,prolixly +pties,pries +ptur,pour +pudosiony,Dionysus +puet,put +puggle,juggle +pulaing,pulsing +pulatic,copulative +pulized,pulverized +pulnard,ulnar +puls,pools +pult,put +pums,poems +puncops,pun cops +puped,pooped +pupets,puppets +purdlized,pluralized +puring,pursing +purowelety,troweler +pyloriers,pyloric +quabless,qua bless +qual,qua +quall,wall +quarcends,archfiends +quargno,quarto +quarwe,quarter +quasm,quasi +quenterwa,frequenter +ques,quest +quic,quick +quiesside,quietness +quing,king +quirema,quire ma +quis,quid +quouguess,outguess +rabstisker,asterisked +raccomy,raccoon +rachables,reachable +rackultry,racetrack +radmisters,rad misters +rafess,strafes +raffic,traffic +ragappigen,entrapping +ragian,raglan +railly,rally +raling,raking +ralise,raise +rals,reals +raming,arming +rannes,ranees +rapectanes,spectralness +raph,rap +rasm,rams +rateroyed,ratepayer +rativils,antivirals +ratobbise,rabbinate +ravorkpism,favoritism +rawalighty,ultralight +raxyped,rapped +reabley,readable +reacill,bacilli +readly,dearly +reaft,raft +realics,relics +reamarber,forbearer +reametry,creamery +rean,near +rece,rec +rechestal,recherches +recid,riced +recon,econ +recromens,recompense +recry,retry +rected,erected +redewery,brewery +redirs,riders +redowes,resowed +reemped,preempted +reer,teer +reerizierm,mesmerizer +reess,seers +refetilon,refection +reflerve,reflexive +refuenity,serenity +regemon,hegemony +regged,egged +rehacous,Cretaceous +reinict,restrict +reit,REIT +rejughthy,rethought +rekho,Redhook +relast,recast +relburduce,reproduce +reles,reels +rellon,Mellon +remancts,remnants +remors,remorse +remph,re mph +rency,ency +reniats,Renaults +renomarrup,renouncement +repart,repast +rephicke,pickerel +rephts,rests +repie,repine +repieval,retrieval +repit,resit +replawk,replace +reppert,rep pert +reprellise,repelling +repte,repute +rers,errs +rervante,Cervantes +rese,retie +resomed,resoled +resotenide,resoluteness +resped,reaped +ress,reds +ressaphord,semaphored +ressiveive,repressive +ressnalley,restlessness +ressofless,roofless +resyners,reasoners +retel,reel +retensial,tensorial +retent,retest +reth,retch +rethequing,reequipping +reting,resting +retols,retold +retteltial,retaliate +rewalian,reliance +rewitorts,redepositor +rezon,rezone +rhants,rants +rhed,red +rhent,rent +rhetings,sheeting +rhised,raised +rifing,reefing +risatente,orientates +roate,roach +robilly,rob illy +rocion,coercion +roid,rood +roise,rouse +rolinentit,nonentity +ronsts,roasts +rontrandly,contraband +roort,root +rophines,morphines +rossable,crossable +rostrystio,prostration +rournal,journal +rours,tours +ructually,structurally +runders,rounders +sabed,based +sabily,shabbily +sably,ably +sachery,Zachery +sadefation,defamation +saffecuts,affects +sagent,agent +sagerecry,savagery +sagnanking,Nanking's +saily,daily +saircons,zircons +saisisal,sisal +salitte,salivate +samic,balsamic +sapolarts,transpolar +satac,sat ac +saters,asters +sathe,sate +sathor,satori +satically,statically +saticande,satanical +saticted,satiated +satilly,sat illy +satiogness,cessations +sationjarp,satiation +sationts,stations +satizably,insatiably +satudius,studious +sauper,super +sayst,says +scally,scaly +scaperther,scapegrace +scatroung,scrounge +schaegge,chargers +schagelum,flagellum +schent,scent +schundion,discussion +scider,sider +scockpirde,stockpiler +scods,sods +scolousive,collusive +scon,son +scorte,scorch +scourn,scour +scowarces,cowardices +scrats,scratch +scullwery,scullery +seang,seeing +secres,seres +secturry,sectary +sedims,bedims 
+sellcud,sell cud +semove,remove +seprizing,apprizings +serifers,serifs +serpher,Serpens +sers,tiers +sesces,senescences +sess,chess +sesty,chesty +setatoner,stationer +setroce,Sheetrock +setrootoi,beetroot +shafer,Shafer +shalic,chalice +shalition,coalition +shanthemot,Shantee +sharised,shared +shasprite,sprite +sheads,sheafs +sheard,shared +shess,sheds +shhorid,horrid +shile,shale +shist,hist +shly,shy +shmed,shed +shmiseaps,mishaps +shnes,shes +shnistes,astonishes +shoeittled,shuttled +sholam,shalom +shoully,should +shovers,shivers +sibuttran,attribution +sier,sigher +siner,siren +sinis,sinus +sintal,sinistral +sionize,ionize +siosping,spinning +sisaced,sis aced +sisation,satiation +sised,sides +sises,sises +sive,chive +sizesse,sizes +skads,squads +skely,Skelly +sker,skier +skiess,skies +skindly,kindly +sking,sling +skism,schism +skizats,skits +slambouria,tambourine +slatoxins,antitoxins +slint,slit +slishi,Salish +slizate,satellite +sloptry,slop try +sluall,squall +sluberisic,tuberculosis +sludial,sundial +slypolity,sly polity +slyzated,slatted +sment,sent +smuns,suns +smusce,muscle +snerial,serial +snes,tines +sness,snugness +snicy,spicy +snisaller,installer +sobelent,somnolent +socoholy,chocoholic +sogrands,so grands +soler,choler +soperize,pauperize +soriatied,sortied +soried,dories +sorship,worship +sosered,sobered +sotoks,Soto +sourked,soured +spagore,spa gore +sparic,sparing +spegerlism,specialism +spelly,spell +spely,spell +sper,aper +sperwarin,whispering +sphaezed,emphasized +sphiens,sphinx +sphingisid,springing +sphologgly,graphology +sphy,spy +spicolle,spicule +spilizes,specializes +spiloli,spillover +splagesly,agelessly +splary,splay +splast,splats +spokene,spoken +spolto,Sposato +spor,spot +sposchises,disposes +sposs,poss +spozons,spoons +sprock,sprocket +spultin,insulting +squader,squared +sque,sue +squeff,squeeze +squing,swing +stally,stall +standers,standers +statiton,station +statoubs,stators +stax,sax +sted,cited +ster,titer +stessed,stressed +stfus,lustful +sthibece,ethicist +stigals,stigmas +stipt,stint +stirtled,startled +stocrics,aristocratic +storcicily,stormily +stortican,mortician +stral,steal +streesked,streaked +strionuar,striation +sturs,stirs +sual,dual +suatioucte,ostentatious +subbeltes,sublimates +subblenes,subtleness +sube,sub +subely,surely +subempele,subsample +subies,busies +subioss,submission +subled,sabled +subran,sub ran +subrautbly,subliterary +subscale,sub scale +substards,substandard +subte,subtle +subusing,sub using +suchly,such +suffinge,suffering +suffled,snuffled +suggy,saggy +sugs,chugs +suierion,submersion +suingerion,fingering +suivelly,illusively +sulcarity,insularity +suld,sud +sulemical,polemical +sulisaing,sustaining +suliting,suiting +sulluck,luckless +sulne,sullen +sultrad,Ultrasuede +sulumpoent,subcomponent +sulved,salved +sumaters,Sumatra +sund,duns +sunpor,support +sunts,aunts +supehand,superhuman +superiers,superiors +supers,supers +supled,suppled +suppidate,supplicate +suppider,supplier +surets,surest +surflue,surfer +surneser,sureness +suromencry,microsurgery +suroti,Grotius +surregum,surrey +surrotte,surrogate +surs,sirs +susnontat,substation +sussavely,suavely +sustings,suitings +sval,val +swarid,sward +swas,saws +swater,seater +swinfed,swindled +swirizer,Switzer +swommented,commented +swoodated,swooshed +sworiesdo,wordiness +sybrotine,subroutine +symage,massage +synay,synapse +syncoweal,syncope +syndsings,singings +syned,synced +synte,syn +syntlere,relentless +sypi,syphilis +sypirrow,sorrow +taback,aback 
+tablabseng,stableness +tablogon,tablespoon +tactionses,transactions +taemang,tangent +tailized,stabilized +taing,tang +tainialic,talismanic +taligs,tali gs +taliraft,tali raft +talted,tasted +tanne,tanner +tanpal,tan pal +tany,tan +tarialoval,variational +taskes,tasked +tatien,patient +tatio,ratio +tedal,teal +tedatinged,ingratiated +teed,teed +teethely,teethe +teggred,regretted +teleming,teeming +tels,lets +tely,rely +temaute,automate +temisence,reminiscence +tenethes,tenthes +tenying,tenting +terally,laterally +teranduss,adulterants +teraptine,terrapin +tere,tear +teream,teamster +terfam,teamster +tergical,liturgical +terocome,latecomer +tesinatic,Teresina +tesplubble,bestubble +thearings,hearings +theming,theeing +thented,tented +thents,thens +thetagered,tethered +thistioter,thirster +thmes,themes +thocatexor,thoriate +tholoons,tho loons +thoollurof,thoughtfully +thos,thews +thota,theta +thots,tots +thousess,outhouses +thwasts,thwarts +ticationg,mastication +ticithoty,catholicity +ticons,icons +tiess,chess +tincuen,tincture +tion,son +tions,sons +tivers,rivets +toethorier,authorizer +togic,tonic +togiciving,Livingston +tolimanes,tourmalines +tolurecla,declarator +tolutradve,autoloader +tomened,omened +tonity,tonality +tontion,intonation +toplicies,topicalities +toriate,thoriate +torint,tor int +tornity,torridity +torran,tor ran +tors,torch +tortforat,Stratford +torth,troth +tosm,toms +totheles,toothless +totionizes,tot ionizes +toused,touched +tradip,triad +trads,drats +trainged,trained +tralize,neutralize +trameads,trammed +tramisess,pastramis +tramyter,tetrameter +trardly,towardly +trast,tarts +trates,rates +tratin,train +tratives,narratives +tratoper,operator +trattes,tatters +trechpose,troposphere +treeness,treeless +trepled,tripled +trianes,triages +tric,tic +trines,tribes +tring,trig +trinon,triton +tripal,trial +trited,trites +triuss,truss +trive,trove +trizare,bizarre +trizing,tricing +trofes,trodes +trolisat,trollish +tromagnody,magnetron +trons,tons +troodism,tropism +tros,tors +trotchous,treacherous +truming,truing +trus,troughs +tryo,tyro +tuatallac,tactual +tuate,actuate +tubleys,tubules +tubstuaing,substituting +tudged,trudged +tudium,tedium +tudog,Tudor +tuncturgi,tincture +turacked,tracked +turalizer,naturalize +turgolling,trolling +turrace,terrace +turrogerns,sturgeons +turtoryoss,torturous +tuser,user +twass,swats +twebles,trebles +tweregaing,regathering +twompecto,twopence +twor,trow +tworing,trowing +twory,worry +ulgalic,Gallic +ully,illy +ultalls,ult alls +unablarid,unabridged +unarts,Stuarts +unate,lunate +unaters,natters +unbatedis,unbated is +uncather,Catherin +unch,inch +uncings,cunnings +uncizes,uncivilizes +uncoes,ounces +unctunds,unctuous +uncy,ency +uncychang,unchanging +undarial,industrial +unde,nude +undeces,undeceive +undess,undress +undested,unrested +undetimies,undermines +undion,union +undled,bundled +undruroch,Durocher +unds,funds +unducker,ducker +unection,unction +uness,unless +ungn,unsung +unheamnes,unharness +unianemic,unmnemonic +unimpant,unimportant +uniness,puniness +unisumbate,unabated +unkban,unbans +unkne,unkink +unktrytee,junketeer +unly,inly +unmonetes,monetizes +unnesst,dunnest +unnilly,unfunnily +unparangly,paralyzingly +unpolot,polyglot +unporksh,uncorks +unposohy,unperson +unprear,uprear +unreass,unreason +unrebra,unreal +unreed,unread +unrents,unrests +unrial,urinal +unscuter,undercut +unsiarache,huarache +unsibly,sensibly +unsidize,unsubsidized +unsises,unisexes +unsishetor,transistor +unspection,inspection +unsph,spun 
+unstage,upstage +unsubces,unsubtle +unsubring,unsurprising +unsulitrus,insulins +untaring,unstaring +untary,unary +unten,untented +unthence,thence +untiatic,undramatic +untock,unlock +untral,antral +unts,nuts +unvageost,savagest +unve,univ +unwhir,whir +uped,upend +uperypen,supermen +uria,curia +uter,uteri +utfurights,outfights +uttory,statutory +vabent,absent +vablessyn,livableness +vage,gave +vago,ago +vairment,impairment +vardes,varies +vars,cars +vass,bass +vateres,enervates +vationat,titivation +vatohype,hypoactive +veamated,amalgamated +velcep,veep +vele,elev +velise,valise +velogismas,neologisms +vely,levy +vene,even +venes,venues +venithts,Venita +ventled,vented +verar,velar +vers,revs +versessel,verses +viaticing,vindicating +viations,aviations +vigalise,valise +vilbuffs,buffs +vilicably,applicably +viling,living +ving,bing +vint,int +visana,Visayans +visoar,visor +vite,Vite +viteleisal,televisual +vively,vilely +vocanister,canister +vocile,voile +vocrailly,vocally +voichent,enrichment +voring,roving +vots,cots +vounhow,townhouse +vout,bout +vules,ovules +vulity,vulgarity +vulowni,clownish +wablion,ablation +wadoming,Dominga +waff,quaff +waints,wains +wancom,wan com +wanterism,wanter ism +warger,wager +wariser,warier +warmorsess,warhorses +waspolis,was polis +watessm,waterless +watichity,Wichita +wattry,watery +waye,way +wayst,ways +weaking,waking +weate,weave +weatic,beatific +weelarist,secularist +weer,queer +weiga,weigh +weiganka,Weizmann +welast,we last +weles,wees +wely,welt +wentriong,contrition +wernbar,Werner +whab,what +whasth,Shasta +whemine,Hermine +wherely,whereby +whes,shew +wheve,whee +whicatene,whitener +whierings,whisperings +whinaride,rawhide +whindefars,whiners +whing,whig +whise,whose +wholamize,wholemeal +wholotios,Lotharios +whoreliese,whorehouse +wicheacy,headache +wimune,immune +winist,winiest +wircul,circular +wiseme,wise me +witing,wighting +wiversor,Rivers +wize,wizen +wogy,logy +wolic,colic +woroads,workloads +woromulver,wolverine +worted,worsted +wourds,words +woutize,routinize +wragamic,tragicomic +wreacts,reacts +wrer,weer +writanued,writhed +wroked,worked +xally,laxly +yakele,Blakeley +yely,rely +yism,ism +yons,tons +yoprotah,taproot +zabjuvely,Juvenal +zimatop,maintop +zingled,zinged +zorewrie,Zorine +zworepent,repentant +zwory,Zworykin diff --git a/tests/corpora/homophones.csv b/tests/corpora/homophones.csv new file mode 100644 index 000000000..b0da8b0e5 --- /dev/null +++ b/tests/corpora/homophones.csv @@ -0,0 +1,1888 @@ +word1,word2 +Abel,able +Ayr,air +Ayr,heir +Ayr,ayre +Ayr,eyre +Ayr,aire +Ayr,are +Baal,bale +Ceres,series +Czech,cheque +Czech,check +Dane,deign +Fax,facts +Finn,fin +Gael,gale +Hyde,hide +I,aye +I,eye +I,ai +Jewry,jury +Lapps,lapse +Lett,let +Letts,lets +Maine,main +Maine,mane +Ne,neigh +Ne,nay +Ne,nae +Ne,nee +Paul,pawl +Rome,roam +Sikh,seek +Sunday,sundae +Taurus,torus +Unix,eunuchs +Wacs,whack +Whig,wig +Winnie,whinny +a,eh +a while,awhile +absents,absence +abstinents,abstinence +accepter,acceptor +accidence,accidents +acclamation,acclimation +add,ad +adds,adze +adds,ads +ade,aide +ade,aid +adherents,adherence +adieu,ado +adolescents,adolescence +adulterous,adulteress +adventurous,adventuress +adze,ads +aerie,eyrie +aerie,airy +aery,aerie +aery,eyrie +aery,airy +aes,ayes +aide,aid +ails,ales +air,heir +air,ayre +air,eyre +air,aire +air,are +aire,are +aisle,isle +aisle,I'll +ale,ail +all,awl +allowed,aloud +already,all ready +altar,alter +altogether,all together +analyst,annalist +ante,aunty +anti-,auntie +anti-,ante 
+anti-,aunty +appetite,apatite +apps,apse +ar,are +ariel,aerial +ark,arc +armor,armer +as,asse +ascent,assent +ass,as +ass,asse +assistance,assistants +ate,ait +attendance,attendants +auger,augur +aught,ought +auk,awk +aunt,ant +auntie,ante +auntie,aunty +autarchy,autarky +aweigh,away +awled,auld +axil,axle +axil,axel +axle,axel +axseed,accede +aye,eye +aye,ai +ayre,eyre +ayre,aire +ayre,are +b,bee +b,be +baa,bah +babble,babel +baching,batching +bade,bad +baht,bot +bail,Baal +bail,bale +bailee,bailie +bailey,bailee +bailey,bailie +bailor,baler +bailor,bailer +bair,bare +bait,bate +baited,bated +baled,bailed +baler,bailer +ball,bawl +balled,bawled +balled,bald +baloney,bologna +band,banned +bann,ban +bar,barre +bard,barred +baron,barren +barque,bark +base,bass +basest,bassist +basil,basal +bask,basque +basses,bases +baste,based +batch,bach +batched,bached +bating,baiting +batt,bat +bawd,baud +bawled,bald +bays,baize +beach,beech +bean,been +bear,bair +bear,bare +bear,bare +beau,bow +bee,be +been,bin +beet,beat +beetle,betel +bel,belle +bel,bell +belle,bell +belligerence,belligerents +berry,bury +berth,birth +bettor,better +bey,bay +beys,bays +beys,baize +bi,by +bi,buy +bier,beer +billed,build +bine,been +bine,bin +bird,burred +birr,bur +birr,burr +bise,bees +bite,bight +bitt,bit +bizarre,bazaar +blair,blare +blend,blende +block,bloc +blue,blew +boarder,border +body's,bodies +boer,bore +boer,boar +bogey,bogy +bogey,bogie +bogy,bogie +bold,bowled +bole,boll +bomb,balm +bonze,bonds +booty,bootee +booze,boos +bore,boar +bored,board +born,bourne +borough,burrow +borough,burro +bough,bow +boulder,bolder +boullion,bullion +bourn,born +bourn,bourne +bowed,bode +bowl,bole +bowl,boll +box,bocks +boy,buoy +brae,bray +braid,brayed +braise,braes +braking,breaking +brassie,brassy +brays,braze +brays,braise +brays,braes +braze,braise +braze,braes +bread,bred +break,brake +bree,brie +breech,breach +brewed,brood +bridal,bridle +brows,browse +bruise,brews +bruit,brute +bruit,brut +brunette,brunet +brute,brut +buccal,buckle +bundt,bunt +bur,burr +burd,bird +burd,burred +burg,berg +buries,berries +burly,burley +burrow,burro +bus,buss +bused,bussed +bust,bused +bust,bussed +butt,but +butte,beaut +buyer,byre +by,buy +bye,bi +bye,by +bye,buy +byte,bite +byte,bight +c's,cees +c's,seize +c's,seas +c's,sees +cache,cash +caches,cashes +caddy,cadi +caddy,caddie +cadi,caddie +caird,cared +calculous,calculus +calendar,calender +calix,calyx +calk,caulk +call,caul +callous,callus +camara,camera +came,kame +cane,Cain +canine,K9 +cannon,canon +canopy,canape +cant,can't +canter,cantor +canvas,canvass +canvases,canvasses +capital,capitol +caret,karat +caret,carat +carol,carrel +carpel,carpal +carries,caries +carrot,caret +carrot,karat +carrot,carat +cashed,cached +cashew,cachou +cashing,caching +cask,casque +cast,caste +caster,castor +caudle,caudal +cause,caws +ceder,cedar +ceding,seeding +cee,c +cees,seize +cees,seas +cees,sees +ceil,seel +ceiling,sealing +cel,sell +cell,cel +cell,sell +cellar,seller +censer,censor +censer,sensor +censer,senser +censor,sensor +censor,senser +center,scenter +cents,cense +cerate,serrate +cere,sear +cere,seer +cereal,serial +cession,session +cete,seat +chaise,shays +champaign,champagne +chance,chants +chantey,chanty +chantey,shanty +chanty,shanty +chare,chair +charred,chard +chary,cherry +chased,chaste +cheap,cheep +cheque,check +chew,Chou +chilly,chili +chin,chine +chitin,chiton +chitin,kitten +chiton,kitten +choc,chock +choir,quire +choose,chews +chorale,corral +chord,cord +chordate,cordate 
+chrism,chrisom +chucker,chukker +chuff,chough +cinque,sync +cinque,sync +cirrus,serous +cirrus,cirrous +cist,cyst +cist,kissed +citable,sightable +cited,sited +cited,sighted +clacque,clack +clairvoyants,clairvoyance +clamber,clammer +clamber,clamor +clammer,clamor +clapboard,clabbered +clause,claws +click,clique +clime,climb +clothes,close +clue,clew +coal,cole +coal,kohl +coarse,course +coarse,corse +cocoa,coco +coddling,codling +cokes,coax +col,call +col,caul +cole,kohl +collar,choler +collared,collard +colonel,kernel +come,cum +competents,competence +complacent,complaisant +complement,compliment +complemented,complimented +complete,compleat +complimenting,complementing +concenter,consenter +condescendents,condescendence +confectionary,confectionery +conk,conch +conn,khan +conn,con +conquer,conker +consent,concent +consulter,consultor +continence,continents +coo,coup +coolly,coulee +coolly,coolie +copped,copt +cops,copse +coquette,coquet +coral,choral +cored,chord +cored,cord +corps,core +correspondents,correspondence +cosine,cosign +cosy,cozy +cosy,cosey +cote,coat +cougher,coffer +coulee,coolie +councilor,counselor +councilor,counsellor +councilor,councillor +counsel,council +counsellor,councillor +counselor,counsellor +counselor,councillor +course,corse +cowered,coward +cox,cocks +coy,koi +cozen,cousin +cozy,cosey +crater,krater +creak,creek +crepe,crape +crewed,crude +crewel,cruel +crews,cruise +crock,croc +cross,crosse +crumby,crummie +crummy,crumby +crummy,crummie +cubit,qubit +cue,queue +cue,q +cue,queue +culler,color +currant,current +cursor,curser +curtesy,courtesy +cygnet,signet +cymbal,symbol +cypress,Cyprus +dammed,damned +damn,dam +daze,days +dear,deer +dee,d +deli,Delhi +deme,deem +dene,dean +dentil,dental +dents,dense +dependence,dependents +deprivation,deprevation +descendant,descendent +descendence,descendents +despondence,despondents +dessert,desert +devel,devle +devil,devel +devil,devle +devisor,deviser +dhole,dole +died,dyed +dine,dyne +disc,disk +discreet,discrete +divisor,devisor +divisor,deviser +do,due +do,dew +do,dough +dock,doc +doe,do +doe,dough +does,doughs +door,dor +douce,deuce +dour,dower +douse,dowse +doze,does +doze,doughs +drachm,dram +draft,draught +droop,drupe +dual,duel +ducked,duct +due,dew +dun,done +dye,die +dyeing,dying +dyer,dire +dyke,dike +e's,ease +eerie,Erie +eight,ate +eight,ait +ell,el +em,m +emerge,immerge +emerged,immerged +emerging,immerging +epoch,epic +equator,equater +equivalence,equivalents +ere,err +ere,Ayr +ere,air +ere,heir +ere,ayre +ere,eyre +ere,aire +ere,are +erne,earn +err,Ayr +err,air +err,heir +err,ayre +err,eyre +err,aire +err,are +eruption,irruption +eruptive,irruptive +eve,eave +ewe,you +ewe,yew +ewes,use +ewes,yews +ex,x +expedients,expedience +eye,ai +eyed,I'd +eyelet,islet +eyer,ire +eyes,aes +eyes,ayes +eyre,aire +eyre,are +eyrie,airy +faery,fairy +fain,feign +faker,fakir +fane,fain +fane,feign +fare,fair +faring,fairing +faro,pharaoh +farrow,faro +farrow,pharaoh +fate,fete +fated,feted +fatelist,fatalist +fawn,faun +fay,fey +faze,fays +fazed,phased +feal,feel +feal,feel +feaze,fees +feet,feat +feeze,feaze +feeze,fees +feint,faint +felloe,fellow +ferri-,ferry +ferule,ferrule +fest,fessed +few,phew +fiance,fiancee +fie,phi +file,faille +fillet,filet +fils,fills +find,fined +finish,Finnish +firs,furs +firs,furze +fisher,fissure +fishing,phishing +fizz,phiz +flair,flare +flak,flack +flea,flee +flecks,flex +flesh,fleche +flew,flu +flew,flue +flock,floc +flocks,phlox +floe,flow +florescence,fluorescence 
+florescence,florescents +flour,flower +flows,floes +flu,flue +fluorescence,florescents +flyer,flier +fold,foaled +fore,four +fore,for +forebear,forbear +forego,forgo +foreward,forward +forme,form +fort,forte +fother,father +foul,fowl +four,for +fourth,forth +franc,frank +frays,fraise +freeze,frieze +freeze,frees +frier,friar +frieze,frees +frits,fritz +fro,froe +fryer,frier +fryer,friar +fuhrer,furore +fuhrer,furor +fungous,fungus +furore,furor +furry,firry +furs,furze +g,gee +gaff,gaffe +gage,gauge +gait,gate +gaited,gated +gaiter,gator +galley,gally +gallop,galop +gamble,gambol +gammon,gamin +gased,gast +gassed,gased +gassed,gast +gat,gaht +gaze,gays +gest,jest +gilled,gild +gin,jinn +glair,glare +glary,glairy +gnu,knew +gnu,nu +gnus,news +gopher,gofer +gored,gourd +gorilla,guerilla +gourde,gourd +gradience,gradients +graft,graphed +grate,great +gray,grey +grayed,grade +graze,grays +grease,Greece +greaves,grieves +greyer,grayer +grievance,grievants +grieve,greave +grip,grippe +grisly,grizzly +grocer,grosser +grow,gros +grown,groan +guarantee,guaranty +guessed,guest +guild,gilled +guild,gild +guilt,gilt +gunwhale,gunnel +guyed,guide +guys,guise +gyve,jive +h,aitch +ha ha,ha-ha +hae,hay +hail,hale +hair,hare +handmaid,handmade +handsome,hansom +hanger,hangar +hart,heart +haul,hall +have,halve +haves,halves +haze,hays +he'd,heed +heal,heel +heal,he'll +heald,heeled +healed,heald +healed,heeled +heard,herd +hears,here's +heel,he'll +height,hight +heir,ayre +heir,eyre +heir,aire +heir,are +heirless,airless +herds,hurds +here,hear +heroin,heroine +hertz,hurts +hey,hae +hey,hay +hi,hie +hi,heigh +hic,hick +hie,heigh +hied,Hyde +hied,hide +high,hi +high,hie +high,heigh +higher,hire +hike,haik +him,hymn +hissed,hist +hoar,whore +hock,hough +hock,hoc +hoe,whoa +hoe,ho +hoes,hose +hold,holed +hole,whole +holm,hom +holy,holey +home,holm +home,hom +horde,whored +horde,hoard +horse,hoarse +hostile,hostel +hough,hoc +hour,our +how's,house +hows,how's +hows,house +hue,hew +hues,hews +humorous,humerus +idle,idyll +idle,idol +idyll,idol +illicit,elicit +immanent,imminent +immergence,emergence +immergences,emergences +immerges,emerges +impassable,impassible +impotents,impotence +in,inn +incandescents,incandescence +incidence,incidents +incite,insight +incompetence,incompetents +independence,independents +indict,indite +inductants,inductance +inflection,inflexion +infuser,infusor +ink,Inc. 
+innocents,innocence +inns,ins +insightful,inciteful +instance,instants +instillation,installation +insurgence,insurgents +intense,intents +invade,inveighed +irk,erk +isle,I'll +its,it's +jabb,jab +jamb,jam +jay,j +jean,gene +jell,gel +jibe,gibe +jinx,jinks +joule,jewel +junkie,junky +karat,carat +kay,k +kayak,kyack +key,cay +khan,con +khat,cot +kill,chyle +kiln,kill +kiln,chyle +kite,kyte +knar,gnar +knaves,naves +knead,kneed +kneading,needing +knee,nee +knew,nu +knickers,nickers +knobs,nobs +knot,not +know-how,no-how +knows,nose +kop,cop +kor,corps +kor,core +kraft,craft +krone,krona +kurd,curd +laager,logger +lac,lack +lacks,lax +lade,leid +lade,laid +lager,laager +lager,logger +lain,lane +lama,llama +lamb,lam +lance,launce +lap,Lapp +laps,Lapps +laps,lapse +lase,leys +lase,lays +laser,lazar +laser,lazer +latches,laches +lay,ley +lay,lei +lazar,lazer +laze,lase +laze,leys +laze,lays +leaches,leeches +lead,led +leaf,lief +leans,liens +leas,laze +leas,lase +leas,leys +leas,lays +leased,least +leaver,lever +ledger,leger +lee,lea +leech,leach +leek,leak +leer,Lear +lees,leas +leid,laid +leis,leas +leis,laze +leis,lase +leis,leys +leis,lays +lesson,lessen +lessor,lesser +let's,Letts +let's,lets +levee,levy +levies,levees +levin,leaven +ley,lei +leys,lays +li,lee +li,lea +liar,lyre +liar,lier +lice,lyse +lichen,liken +licht,licked +lien,lean +lieu,loo +lieve,leave +lightening,lightning +limbs,limns +limn,limb +liquor,licker +liquors,lickers +littoral,literal +llamas,lama +lo-cal,locale +load,lowed +loath,loathe +loch,lakh +lochs,locks +lochs,lox +lock,lough +lock,loch +lock,lakh +locks,lox +lode,load +lode,lowed +loess,less +lone,loan +loner,loaner +loose,luce +loot,lute +loots,lutes +lose,loos +lough,loch +lough,lakh +loupe,loop +low,lo +lune,loon +lux,luxe +lye,lie +lynx,links +lyre,lier +made,maid +magnet,magnate +magnificence,magnificents +mail,male +mails,males +main,mane +maize,maze +malevolence,malevolents +mall,maul +mandrel,mandrill +manila,manilla +manners,manors +manor,manner +mantle,mantel +marc,marque +marcs,Marx +mark,marc +mark,marque +marks,marques +marks,marcs +marks,Marx +marques,marcs +marques,Marx +marquis,marquee +marshal,martial +marten,martin +masque,mask +mast,massed +matt,mat +matte,matt +matte,mat +maybe,may be +mead,meed +mean,mien +mean,mesne +meat,meet +meatier,meteor +meddle,medal +meddles,medals +mete,meat +mete,meet +meting,meeting +mettle,metal +mewl,mule +mews,muse +mhos,mows +mi,me +mien,mesne +mill,mil +millinery,millenary +mind,mined +miner,minor +miners,minors +mints,mince +minx,minks +miscible,missable +missal,missile +missals,missiles +misses,Mrs. +missus,misses +missus,Mrs. 
+mist,missed +mite,might +mix,micks +moat,mote +mock,mach +mole,mol +moo,moue +mood,mooed +mordant,mordent +morn,mourn +mots,mhos +mots,mows +mourning,morning +mousse,moose +mow,mot +mowed,mode +mown,moan +much,mutch +mucus,mucous +mur,myrrh +murderous,murderess +muscles,mussels +musket,muscat +mussed,must +mussel,muscle +mustered,mustard +mutual,mutuel +n,en +nae,nee +nap,knap +narc,nark +navel,naval +nay,nae +nay,nee +neap,neep +need,knead +need,kneed +needed,kneaded +needs,kneads +neigh,nay +neigh,nae +neigh,nee +new,gnu +new,knew +new,nu +nice,gneiss +nickle,nickel +night,knight +nit,knit +nits,knits +nix,nicks +no,know +nob,knob +nock,knock +noes,knows +noes,nose +non-residence,non-residents +none,nun +nonexistents,nonexistence +noose,nous +obedients,obedience +odd,od +ode,owed +oh,owe +ohs,owes +oke,oak +one,won +oohs,ooze +or,oar +oracle,auricle +oral,aural +ore,or +ore,oar +ores,oars +ours,hours +overdue,overdo +overrate,overate +oversees,overseas +pa,pas +pa,pah +packed,pact +packs,pax +paid,payed +pair,pare +paired,pared +palate,palette +pale,pail +pall,Paul +pall,pawl +pallet,palate +pallet,palette +pan,panne +pane,pain +paned,pained +paring,pairing +parol,parole +parred,pard +pas,pah +passable,passible +past,passed +paste,paced +paten,patten +patience,patients +paws,pause +pea,p +pea,pee +pea,pe +peace,piece +peak,peke +peak,peek +peaked,peeked +peal,peel +pealing,peeling +peals,peels +pear,pere +pear,pair +pear,pare +peas,pease +peas,pees +pease,pees +pecten,pectin +pedal,peddle +pedalled,peddled +peddles,pedals +pee,pe +peeled,pealed +peen,pean +peke,peek +pencel,pencil +pend,penned +penitence,penitents +pennants,penance +penny,penni +pensile,pencel +pensile,pencil +pere,pair +pere,pare +permanents,permanence +pew,piu +pharming,farming +phase,faze +phase,fays +phases,fazes +phasing,fazing +phat,fat +phenyl,fennel +phial,file +phial,faille +philter,filter +phosphorescents,phosphorescence +phosphorus,phosphorous +phrase,frays +phrase,fraise +phreak,freak +pi,pie +pic,pick +pica,pika +pidgin,pigeon +pier,peer +pique,peak +pique,peke +pique,peek +piqued,peaked +piqued,peeked +pistil,pistol +pix,pyx +pix,picks +pixie,pyxie +place,plaice +plainer,planer +plait,plate +planar,plainer +planar,planer +planar,planer +plane,plain +planes,plains +plantar,planter +plater,plaiter +plating,plaiting +pleas,please +plumb,plum +plumbs,plums +plural,pleural +pocks,pox +poise,pois +poky,pokey +poled,polled +poler,poller +poler,polar +police,pelisse +politick,politic +poll,pole +poller,polar +pompous,pampas +poof,pouffe +populous,populace +populus,populous +populus,populace +pored,poured +poring,pouring +potpourri,popery +potty,pottie +pouf,poof +pouf,pouffe +pour,pore +pours,pores +praise,prays +prase,preys +prase,praise +prase,prays +pray,prey +prayed,preyed +preadolescents,preadolescence +precedence,precedents +precisian,precision +premiere,premier +presents,presence +presser,pressor +prest,pressed +preyers,prayers +preying,praying +preys,praise +preys,prays +pride,pried +pries,prise +principle,principal +prints,prince +prior,prier +prize,pries +prize,prise +prophesy,prophecy +prophet,profit +pros,prose +protean,protein +psi,xi +pubescence,pubescents +pumice,pomace +pupil,pupal +pure,puer +purist,purest +purl,pearl +purr,per +purse,perse +puttees,putties +putty,puttee +pyknic,picnic +pyx,picks +q,queue +quartz,quarts +quay,key +quay,cay +quay,key +quean,queen +quints,quince +quod,quad +quoin,coin +rabbit,rabbet +rack,wrack +racked,wracked +rackett,racket +racks,wracks +racks,rax 
+racquet,rackett +racquet,racket +radical,radicle +raid,rayed +rained,reined +rains,reins +raise,rays +rale,rail +rang,wrang +ranker,rancor +rap,wrap +rapped,rapt +rapping,wrapping +raps,wraps +rase,raze +rase,raise +rase,rays +ray,re +raze,raise +raze,rays +razed,raised +razer,razor +razer,raiser +razor,raiser +reading,reeding +reave,reeve +reave,rieve +rebuild,rebilled +recede,reseed +receding,reseeding +recite,resight +recited,resighted +red,read +redd,red +redd,read +rede,reed +rede,read +reed,read +reel,real +reeve,rieve +reign,rein +reign,rain +reigned,rained +reigned,reined +reigns,rains +reigns,reins +rein,rain +relayed,relaid +remark,remarque +remittance,remittants +repeal,repeel +repost,riposte +reseat,receipt +reseeded,receded +resewed,resowed +resewn,resown +resews,resows +resighting,resiting +resighting,reciting +resistants,resistance +resistor,resister +resite,recite +resite,resight +resited,recited +resited,resighted +resiting,reciting +resonate,resinate +rest,wrest +rests,wrests +retarred,retard +review,revue +revues,reviews +rheumy,roomy +rheumy,roomie +rho,row +rigger,rigor +right,rite +right,write +rights,wrights +rights,rites +rights,writes +rigors,riggers +rime,rhyme +ring,wring +ringer,wringer +rings,wrings +riot,ryot +rite,write +rites,writes +road,rowed +road,rode +rock,roc +rocky,raki +roe,rho +roe,row +roe,row +roles,rolls +roll,role +rondo,rondeau +roo,roux +rood,rude +rood,rued +roomer,rumor +roomy,roomie +root,route +roots,routes +rose,rows +rose,roes +rosery,rosary +rote,wrote +rough,ruff +rough,ruff +rout,route +routed,rooted +routing,rooting +rowed,rode +rows,roes +rows,rouse +rude,rued +rude,rued +rue,roo +rue,roux +rues,roos +rues,ruse +rum,rhumb +ruse,rues +ruse,roos +rye,wry +ryes,rise +rynd,rind +ryse,ryes +ryse,rise +s,ess +sac,sack +sac,sacque +saccharine,saccharin +sachet,sashay +sack,sacque +sacks,sacs +sale,sail +sales,sails +salter,psalter +salvor,salver +samite,psammite +sandhi,sandy +saner,seiner +sank,cinque +sari,sorry +saver,savor +savior,saviour +sax,sacks +sax,sacs +scaler,scalar +scend,send +scene,seen +scent,sent +scent,cent +scents,cents +scents,cense +se,say +sea,cee +sea,c +seal,ceil +seal,seel +seamen,semen +sear,seer +sear,seer +seas,sees +seas,sees +seas,seize +sects,sex +see,sea +see,cee +see,c +see,sea +seed,cede +seeder,ceder +seeder,cedar +seem,seam +seemed,seamed +seems,seams +sees,seize +seine,sane +seize,seas +seize,sees +senate,sennit +sennet,senate +sennet,sennit +sense,scents +sense,cents +sense,cense +sensing,censing +sensor,senser +sent,cent +sequence,sequents +sere,cere +sere,sear +sere,seer +sere,sear +sere,seer +serif,seraph +serous,cirrous +settlor,settler +settlors,settlers +sew,so +sewed,sowed +sewer,suer +sewer,sower +sewn,sone +sewn,sown +sext,sexed +shake,sheik +she's,shes +shea,she +sheer,shear +sheers,shears +sheik,chic +shifery,shivary +shifery,shivaree +shivary,shivaree +shoe,shoo +shoes,shoos +shooed,shoed +shoot,chute +shown,shone +sibilance,sibilants +sick,sic +sics,six +sie,scye +sigh,psi +sigh,xi +sigh,sie +sigh,scye +sighed,side +sight,cite +sighting,siting +sighting,citing +sights,cites +sign,sine +sign,syne +signs,sines +silence,silents +sine,syne +sink,cinque +sink,sync +sink,cinque +sink,sync +sinking,synching +sioux,sou +sire,sigher +sitable,citable +sitable,sightable +site,sight +site,cite +sited,sighted +sites,sights +sites,cites +siting,citing +size,sighs +skull,scull +slaying,sleighing +sleeve,sleave +sleigh,slay +sleighed,slayed +sleight,slight +sloe,slow +slough,slue +slough,slew 
+slough,sluff +slue,slew +snees,sneeze +soared,sword +soke,soak +sole,soul +sole,sol +soled,sold +sone,sown +sons,suns +soot,suit +sore,soar +sou,sault +soul,sol +sow,sew +sow,so +spay,spae +spayed,spade +speck,spec +specs,specks +spits,spitz +spore,spoor +squalor,squaller +staid,stayed +staid,stade +stakes,steaks +stanch,staunch +staph,staff +stare,stair +starlit,starlet +stater,stator +statice,status +stationery,stationary +stayed,stade +steak,stake +steal,steel +steal,stele +steel,stele +steeling,stealing +step,steppe +steppes,steps +stere,steer +sticks,Styx +stile,style +stoop,stoup +stoops,stoup +storey,story +straight,strait +straiten,straighten +stridor,strider +stupe,stoop +stupe,stoup +stupes,stoops +stupes,stoup +sty,stye +succulents,succulence +sucker,succor +sue,sou +sue,sault +sue,sioux +sue,sou +sues,sous +sum,some +summary,summery +sun,sunn +sun,son +sunn,son +sunny,sonny +surf,serf +surge,serge +sutler,subtler +swat,swot +swayed,suede +sweet,suite +sweets,suites +sycosis,psychosis +t,ti +t,tee +tack,tach +tacked,tact +tacks,tax +tail,tale +tail,tael +tailer,tailor +tails,tales +tale,tael +tapir,taper +tar,tahr +tare,tear +tartar,tarter +taut,taught +tea,t +tea,ti +tea,tee +teal,teil +teamed,teemed +teaming,teeming +teams,teems +tear,tier +teas,tis +teas,tease +teat,teet +teem,team +tees,teas +tees,tis +tees,tease +tell,tel +tenor,tenner +tenser,tensor +tents,tense +terne,tern +terse,terce +thee,the +their,there +their,they're +theirs,there's +there,they're +therefore,therefor +threw,through +throe,throw +throes,throws +throne,thrown +thru,threw +thru,through +ti,tee +tic,tick +ticks,tics +tide,tied +tie,tye +tie,Thai +tighten,titan +tighter,titer +till,til +timbre,timber +timbre,tambour +time,thyme +tis,tease +tix,ticks +tix,tics +to,too +to,two +toady,tody +tocsin,toxin +toed,towed +toed,toad +told,tolled +tole,toll +ton,tonne +too,two +tool,tulle +tooter,tutor +tope,taupe +tor,tore +torah,tora +torcher,torture +torte,tort +toughed,tuft +tow,toe +towed,toad +tower,toer +tows,toes +tract,tracked +traitoress,traitorous +trayed,trade +trays,treys +tres,trey +tres,tray +trey,tray +trochee,troche +trooper,trouper +troupe,troop +trussed,trust +trusty,trustee +try,tri +tucks,tux +tuff,tough +tule,tool +tule,tulle +tun,ton +tun,tonne +tune,toon +tung,tongue +turban,turbine +turn,terne +turn,tern +turnery,ternary +tutors,tooters +tye,Thai +tyer,tyre +tyer,tire +typhous,typhus +tyre,tire +uh,a +uncle,uncal +undue,undo +unreal,unreel +urn,erne +urn,earn +use,yews +vail,veil +vail,vale +valence,valance +vane,vein +vane,vain +variance,variants +veil,vale +vein,vain +venous,Venus +versed,verst +vice,vise +vile,viol +vile,vial +villus,villous +viol,vial +viscous,viscus +wack,wax +wack,Wacs +wack,whack +wade,weighed +wail,whale +wailer,waler +waist,waste +wait,weight +waits,weights +waive,wave +waiver,waver +wale,wail +wale,whale +wane,wain +war,wore +ware,where +ware,wear +ware,weir +warrantee,warrranty +warred,ward +wart,wort +wasted,waisted +wastes,waists +watt,what +waved,waived +waves,waives +waving,waiving +wax,Wacs +wax,whack +ways,weighs +we,whee +we,wee +we'd,weed +we'll,weal +we'll,wheel +we've,weave +weak,week +weakly,weekly +weal,wheel +weald,wheeled +weald,wield +wear,weir +wears,wares +weather,whether +ween,wean +weigh,whey +weigh,way +weighted,waited +weiner,weaner +weir,we're +weld,welled +wen,when +wet,whet +wether,weather +wether,whether +whacks,wax +whaler,wailer +whaler,waler +wheal,we'll +wheal,weal +wheal,wheel +whee,wee +wheeled,wield +where,wear +where,weir 
+where's,wears +where's,wares +whey,way +which,wich +whin,wynn +whine,wine +whined,wined +whined,wynd +whined,wind +whines,wines +whinings,winings +whir,were +whirl,whorl +whish,wish +whit,wit +whither,wither +who,hoo +whoa,ho +wholly,holy +wholly,holey +whoo,woo +whop,wop +whored,hoard +whose,who's +why,wye +whys,wise +wicker,whicker +wild,whiled +wile,while +win,whin +win,wynn +wined,wynd +wined,wind +wining,whining +winze,wins +witch,which +witch,wich +wittier,Whittier +woe,whoa +wont,won't +wood,would +word,whirred +worn,warn +worst,wurst +wracking,racking +wracks,rax +wraith,rathe +wrapped,rapped +wrapped,rapt +wrapper,rapper +wreak,reek +wreaking,reeking +wreaks,reeks +wreck,reck +wrecks,rex +wretch,retch +wright,right +wright,rite +wright,write +wrights,rites +wrights,writes +wrung,rung +wyes,whys +wyes,wise +wynd,wind +xero-,zero +y,why +y,wye +yak,yack +yawl,y'all +yoke,yolk +yokes,yolks +yore,you're +yore,your +yores,yours +you,yew +you're,your +yule,you'll +zee,z diff --git a/tests/corpora/simple-ngrams-pos.txt b/tests/corpora/simple-ngrams-pos.txt new file mode 100644 index 000000000..cfd2fa040 --- /dev/null +++ b/tests/corpora/simple-ngrams-pos.txt @@ -0,0 +1,8 @@ +the_DET 2015 20 20 +quick_ADJ 2015 2 2 +brown_ADJ 2015 3 3 +fox_NOUN 2015 1 1 +jumped_VERB 2015 3 3 +over_PREP 2015 6 6 +lazy_ADJ 2015 1 1 +dog_NOUN 2015 5 5 diff --git a/tests/corpus/test_corpus_corpus.py b/tests/corpus/test_corpus_corpus.py index fa63d060a..33a181667 100644 --- a/tests/corpus/test_corpus_corpus.py +++ b/tests/corpus/test_corpus_corpus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -31,13 +31,14 @@ import unittest from abydos.corpus import Corpus +from abydos.tokenizer import QSkipgrams class CorpusTestCases(unittest.TestCase): """Test Corpus class.""" - sotu2015Sample = 'Mr. Speaker, Mr. Vice President, Members of Congress, my\ - fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\ + sotu2015_sample = "Mr. Speaker, Mr. 
Vice President, Members of Congress,\ + my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\ years that dawned with terror touching our shores; that unfolded with a\ new generation fighting two long and costly wars; that saw a vicious\ recession spread across our nation and the world.\n It has been, and still\ @@ -46,7 +47,7 @@ class CorpusTestCases(unittest.TestCase): jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\ than it was before the financial crisis.\n More of our kids are graduating\ than ever before.\n More of our people are insured than ever before.\n And\ - we are as free from the grip of foreign oil as we\'ve been in almost 30\ + we are as free from the grip of foreign oil as we've been in almost 30\ years.\n\nTonight, for the first time since 9/11, our combat mission in\ Afghanistan is over.\n Six years ago, nearly 180,000 American troops\ served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\ @@ -58,7 +59,7 @@ class CorpusTestCases(unittest.TestCase): Union is strong.\n\nAt this moment -- with a growing economy, shrinking\ deficits, bustling industry, booming energy production -- we have risen\ from recession freer to write our own future than any other nation on\ - Earth.\n It\'s now up to us to choose who we want to be over the next 15\ + Earth.\n It's now up to us to choose who we want to be over the next 15\ years and for decades to come.\n\nWill we accept an economy where only a\ few of us do spectacularly well?\n Or will we commit ourselves to an\ economy that generates rising incomes and chances for everyone who makes\ @@ -69,11 +70,11 @@ class CorpusTestCases(unittest.TestCase): be sorted into factions and turned against one another?\n Or will we\ recapture the sense of common purpose that has always propelled America\ forward?\n\nIn two weeks, I will send this Congress a budget filled with\ - ideas that are practical, not partisan.\n And in the months ahead, I\'ll\ + ideas that are practical, not partisan.\n And in the months ahead, I'll\ crisscross the country making a case for those ideas.\n So tonight, I want\ to focus less on a checklist of proposals, and focus more on the values at\ - stake in the choices before us.' - sotu2015Corpus = Corpus(sotu2015Sample, filter_chars='.?-;,:') + stake in the choices before us." 
+ sotu2015_corpus = Corpus(sotu2015_sample, filter_chars='.?-;,:') def test_corpus(self): """Test abydos.corpus.Corpus.""" @@ -207,6 +208,27 @@ def test_corpus(self): ] ], ) + self.assertEqual( + Corpus( + 'quick', word_tokenizer=QSkipgrams(qval=3, start_stop='') + ).corpus, + [ + [ + [ + 'qui', + 'quc', + 'quk', + 'qic', + 'qik', + 'qck', + 'uic', + 'uik', + 'uck', + 'ick', + ] + ] + ], + ) def test_corpus_docs_raw(self): """Test abydos.corpus.Corpus.paras, .docs, .docs_of_words, .raw.""" @@ -254,11 +276,11 @@ def test_corpus_idf(self): wiki_idf_corpus = Corpus(wiki_idf_sample) self.assertAlmostEqual(wiki_idf_corpus.idf('this'), 0) - self.assertAlmostEqual(wiki_idf_corpus.idf('example'), 0.30102999566) + self.assertAlmostEqual(wiki_idf_corpus.idf('example'), 0.69314718056) self.assertAlmostEqual(wiki_idf_corpus.idf('these'), float('inf')) self.assertAlmostEqual(wiki_idf_corpus.idf('A'), float('inf')) self.assertAlmostEqual( - wiki_idf_corpus.idf('A', lambda w: w.upper()), 0.30102999566 + wiki_idf_corpus.idf('A', lambda w: w.upper()), 0.69314718056 ) diff --git a/tests/corpus/test_corpus_n_gram_corpus.py b/tests/corpus/test_corpus_n_gram_corpus.py index f66a02b10..87f6c5b6d 100644 --- a/tests/corpus/test_corpus_n_gram_corpus.py +++ b/tests/corpus/test_corpus_n_gram_corpus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -16,7 +16,7 @@ # You should have received a copy of the GNU General Public License # along with Abydos. If not, see . -"""abydos.tests.corpus.test_ngram. +"""abydos.tests.corpus.test_n_gram_corpus. This module contains unit tests for abydos.corpus._n_gram_corpus """ @@ -47,8 +47,8 @@ class NGramCorpusTestCases(unittest.TestCase): double_corpus.gng_importer(_corpus_file('simple-ngrams.txt')) double_corpus.gng_importer(_corpus_file('simple-ngrams.txt')) - sotu2015Sample = 'Mr. Speaker, Mr. Vice President, Members of Congress, my\ - fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\ + sotu2015_sample = "Mr. Speaker, Mr. 
Vice President, Members of Congress,\ + my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\ years that dawned with terror touching our shores; that unfolded with a\ new generation fighting two long and costly wars; that saw a vicious\ recession spread across our nation and the world.\n It has been, and still\ @@ -57,7 +57,7 @@ class NGramCorpusTestCases(unittest.TestCase): jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\ than it was before the financial crisis.\n More of our kids are graduating\ than ever before.\n More of our people are insured than ever before.\n And\ - we are as free from the grip of foreign oil as we\'ve been in almost 30\ + we are as free from the grip of foreign oil as we've been in almost 30\ years.\n\nTonight, for the first time since 9/11, our combat mission in\ Afghanistan is over.\n Six years ago, nearly 180,000 American troops\ served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\ @@ -69,7 +69,7 @@ class NGramCorpusTestCases(unittest.TestCase): Union is strong.\n\nAt this moment -- with a growing economy, shrinking\ deficits, bustling industry, booming energy production -- we have risen\ from recession freer to write our own future than any other nation on\ - Earth.\n It\'s now up to us to choose who we want to be over the next 15\ + Earth.\n It's now up to us to choose who we want to be over the next 15\ years and for decades to come.\n\nWill we accept an economy where only a\ few of us do spectacularly well?\n Or will we commit ourselves to an\ economy that generates rising incomes and chances for everyone who makes\ @@ -80,19 +80,19 @@ class NGramCorpusTestCases(unittest.TestCase): be sorted into factions and turned against one another?\n Or will we\ recapture the sense of common purpose that has always propelled America\ forward?\n\nIn two weeks, I will send this Congress a budget filled with\ - ideas that are practical, not partisan.\n And in the months ahead, I\'ll\ + ideas that are practical, not partisan.\n And in the months ahead, I'll\ crisscross the country making a case for those ideas.\n So tonight, I want\ to focus less on a checklist of proposals, and focus more on the values at\ - stake in the choices before us.' - sotu2015Corpus = Corpus(sotu2015Sample, filter_chars='.?-;,:') + stake in the choices before us." 
+ sotu2015_corpus = Corpus(sotu2015_sample, filter_chars='.?-;,:') - sotu_ngcorpus_uni = NGramCorpus(sotu2015Corpus) + sotu_ngcorpus_uni = NGramCorpus(sotu2015_corpus) sotu_ngcorpus_tri = NGramCorpus() - sotu_ngcorpus_tri.corpus_importer(sotu2015Corpus, 3, '', '') + sotu_ngcorpus_tri.corpus_importer(sotu2015_corpus, 3, '', '') sotu_ngcorpus_5 = NGramCorpus() - sotu_ngcorpus_5.corpus_importer(sotu2015Corpus, 5, '', '') + sotu_ngcorpus_5.corpus_importer(sotu2015_corpus, 5, '', '') simple_ngcorpus_5 = NGramCorpus() simple_ngcorpus_5.corpus_importer( @@ -103,7 +103,7 @@ def test_init(self): """Test abydos.corpus.NGramCorpus.__init__.""" self.assertIsInstance(NGramCorpus(), NGramCorpus) self.assertRaises(TypeError, NGramCorpus, ['a', 'b', 'c']) - self.assertIsInstance(NGramCorpus(self.sotu2015Corpus), NGramCorpus) + self.assertIsInstance(NGramCorpus(self.sotu2015_corpus), NGramCorpus) def test_corpus_importer(self): """Test abydos.corpus.NGramCorpus.corpus_importer.""" @@ -209,19 +209,6 @@ def test_get_count(self): self.assertEqual(self.simple_corpus.get_count(['the', 'quick']), 2) self.assertEqual(self.simple_corpus.get_count(['trolley']), 0) - def test_tf(self): - """Test abydos.corpus.NGramCorpus.tf.""" - # zero case - self.assertEqual(self.sotu_ngcorpus_uni.tf('Niall'), 0) - - # simple cases - self.assertAlmostEqual(self.sotu_ngcorpus_uni.tf('the'), 2.2787536) - self.assertAlmostEqual(self.sotu_ngcorpus_uni.tf('America'), 1.4771213) - - # bigrams - self.assertRaises(ValueError, self.sotu_ngcorpus_tri.tf, 'the sense') - self.assertRaises(ValueError, self.sotu_ngcorpus_tri.tf, 'the world') - if __name__ == '__main__': unittest.main() diff --git a/tests/corpus/test_corpus_unigram_corpus.py b/tests/corpus/test_corpus_unigram_corpus.py new file mode 100644 index 000000000..62a999a65 --- /dev/null +++ b/tests/corpus/test_corpus_unigram_corpus.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.corpus.test_unigram_corpus. + +This module contains unit tests for abydos.corpus._unigram_corpus +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import os +import sys +import tempfile +import unittest +from collections import defaultdict + +from abydos.corpus import UnigramCorpus +from abydos.phonetic import Soundex +from abydos.tokenizer import QSkipgrams + +from .. import _corpus_file + + +class UnigramCorpusTestCases(unittest.TestCase): + """Test abydos.corpus.UnigramCorpus.""" + + simple_corpus = UnigramCorpus() + simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt')) + + double_corpus = UnigramCorpus() + double_corpus.gng_importer(_corpus_file('simple-ngrams.txt')) + double_corpus.gng_importer(_corpus_file('simple-ngrams.txt')) + + sotu2015_sample = "Mr. Speaker, Mr. 
Vice President, Members of Congress,\ + my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\ + years that dawned with terror touching our shores; that unfolded with a\ + new generation fighting two long and costly wars; that saw a vicious\ + recession spread across our nation and the world.\n It has been, and still\ + is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\ + after a breakthrough year for America, our economy is growing and creating\ + jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\ + than it was before the financial crisis.\n More of our kids are graduating\ + than ever before.\n More of our people are insured than ever before.\n And\ + we are as free from the grip of foreign oil as we've been in almost 30\ + years.\n\nTonight, for the first time since 9/11, our combat mission in\ + Afghanistan is over.\n Six years ago, nearly 180,000 American troops\ + served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\ + we salute the courage and sacrifice of every man and woman in this 9/11\ + Generation who has served to keep us safe.\n We are humbled and grateful\ + for your service.\n\nAmerica, for all that we have endured; for all the\ + grit and hard work required to come back; for all the tasks that lie\ + ahead, know this: The shadow of crisis has passed, and the State of the\ + Union is strong.\n\nAt this moment -- with a growing economy, shrinking\ + deficits, bustling industry, booming energy production -- we have risen\ + from recession freer to write our own future than any other nation on\ + Earth.\n It's now up to us to choose who we want to be over the next 15\ + years and for decades to come.\n\nWill we accept an economy where only a\ + few of us do spectacularly well?\n Or will we commit ourselves to an\ + economy that generates rising incomes and chances for everyone who makes\ + the effort?\n\nWill we approach the world fearful and reactive, dragged\ + into costly conflicts that strain our military and set back our\ + standing?\n Or will we lead wisely, using all elements of our power to\ + defeat new threats and protect our planet?\n\nWill we allow ourselves to\ + be sorted into factions and turned against one another?\n Or will we\ + recapture the sense of common purpose that has always propelled America\ + forward?\n\nIn two weeks, I will send this Congress a budget filled with\ + ideas that are practical, not partisan.\n And in the months ahead, I'll\ + crisscross the country making a case for those ideas.\n So tonight, I want\ + to focus less on a checklist of proposals, and focus more on the values at\ + stake in the choices before us." 
+ sotu2015_corpus = UnigramCorpus(sotu2015_sample) + + sdx_corpus = UnigramCorpus(word_transform=Soundex().encode) + + qsg_corpus = UnigramCorpus( + word_tokenizer=QSkipgrams(qval=3, start_stop='') + ) + + pos_corpus = UnigramCorpus() + pos_corpus.gng_importer(_corpus_file('simple-ngrams-pos.txt')) + + def test_unigram_corpus_init(self): + """Test abydos.corpus.UnigramCorpus.__init__.""" + self.assertIsInstance(UnigramCorpus(), UnigramCorpus) + self.assertIsInstance(self.sotu2015_corpus, UnigramCorpus) + + def test_unigram_corpus_gng_importer(self): + """Test abydos.corpus.UnigramCorpus.gng_importer.""" + self.assertIsInstance(self.simple_corpus, UnigramCorpus) + self.assertIsInstance(self.simple_corpus.corpus, defaultdict) + + # skip tests of UnigramCorpus on Python < 3.6 (lack ordered dict) + if sys.version_info < (3, 6): + return + + self.sdx_corpus.gng_importer('tests/corpora/simple-ngrams.txt') + self.assertEqual( + list(self.sdx_corpus.corpus.items()), + [ + ('T000', (20, 20)), + ('Q200', (2, 2)), + ('B650', (3, 3)), + ('F200', (1, 1)), + ('J513', (4, 4)), + ('O160', (6, 6)), + ('L200', (1, 1)), + ('D200', (5, 5)), + ('T220', (2, 2)), + ('Q216', (1, 1)), + ('B651', (1, 1)), + ('F251', (1, 1)), + ('O163', (3, 3)), + ('T420', (2, 2)), + ('L232', (1, 1)), + ], + ) + + self.qsg_corpus.gng_importer('tests/corpora/simple-ngrams.txt') + self.assertEqual( + list(self.qsg_corpus.corpus.items())[:30:2], + [ + ('the', (27, 27)), + ('quc', (5, 5)), + ('qic', (5, 5)), + ('qck', (5, 5)), + ('uik', (5, 5)), + ('ick', (5, 5)), + ('brw', (5, 5)), + ('bow', (5, 5)), + ('bwn', (5, 5)), + ('ron', (5, 5)), + ('own', (5, 5)), + ('jum', (5, 5)), + ('jue', (6, 5)), + ('jmp', (5, 5)), + ('jmd', (5, 5)), + ], + ) + + for term, _ in self.pos_corpus.corpus.items(): + self.assertTrue('_' not in term) + + def test_unigram_corpus_save_load_corpus(self): + """Test abydos.corpus.UnigramCorpus.save_corpus & .load_corpus.""" + handle, path = tempfile.mkstemp('.dat') + self.sotu2015_corpus.save_corpus(path) + self.sotu2015_corpus.load_corpus(path) + statinfo = os.stat(path) + self.assertGreater(statinfo.st_size, 0) + os.close(handle) + os.remove(path) + + def test_unigram_corpus_idf(self): + """Test abydos.corpus.UnigramCorpus.idf.""" + # string-style tests + self.assertAlmostEqual(self.simple_corpus.idf('the'), 0.69314718056) + self.assertAlmostEqual(self.simple_corpus.idf('quick'), 2.3978952728) + self.assertAlmostEqual(self.simple_corpus.idf('trolley'), float('inf')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance__token_distance.py b/tests/distance/test_distance__token_distance.py new file mode 100644 index 000000000..98dc4b0d1 --- /dev/null +++ b/tests/distance/test_distance__token_distance.py @@ -0,0 +1,420 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance__token_distance. 
+ +This module contains unit tests for abydos.distance._TokenDistance +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest +from collections import Counter + +from abydos.distance import ( + AverageLinkage, + DamerauLevenshtein, + Jaccard, + JaroWinkler, + SokalMichener, +) +from abydos.stats import ConfusionTable +from abydos.tokenizer import ( + CharacterTokenizer, + QSkipgrams, + WhitespaceTokenizer, +) + + +class TokenDistanceTestCases(unittest.TestCase): + """Test _TokenDistance functions. + + abydos.distance._TokenDistance + """ + + cmp_j_crisp = Jaccard(intersection_type='crisp') + cmp_j_soft = Jaccard(intersection_type='soft') + cmp_j_fuzzy = Jaccard( + intersection_type='fuzzy', metric=DamerauLevenshtein(), threshold=0.4 + ) + cmp_j_linkage = Jaccard(intersection_type='linkage') + cmp_j_linkage_int = Jaccard( + intersection_type='linkage', internal_assignment_problem=True + ) + + def test_crisp_jaccard_sim(self): + """Test abydos.distance.Jaccard.sim (crisp).""" + # Base cases + self.assertEqual(self.cmp_j_crisp.sim('', ''), 1.0) + self.assertEqual(self.cmp_j_crisp.sim('a', ''), 0.0) + self.assertEqual(self.cmp_j_crisp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_j_crisp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_j_crisp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_j_crisp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_j_crisp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_j_crisp.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_j_crisp.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_j_crisp.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_j_crisp.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_j_crisp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_soft_jaccard_sim(self): + """Test abydos.distance.Jaccard.sim (soft).""" + # Base cases + self.assertEqual(self.cmp_j_soft.sim('', ''), 1.0) + self.assertEqual(self.cmp_j_soft.sim('a', ''), 0.0) + self.assertEqual(self.cmp_j_soft.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_j_soft.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_j_soft.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_j_soft.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_j_soft.sim('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual( + self.cmp_j_soft.sim('Nigel', 'Niall'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_j_soft.sim('Niall', 'Nigel'), 0.4444444444 + ) + self.assertAlmostEqual(self.cmp_j_soft.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_j_soft.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_j_soft.sim('ATCAACGAGT', 'AACGATTAG'), 0.6071428571428571 + ) + + self.assertAlmostEqual( + Jaccard(intersection_type='soft', metric=JaroWinkler()).sim( + 'synonym', 'antonym' + ), + 0.5833333333333, + ) + + def test_fuzzy_jaccard_sim(self): + """Test abydos.distance.Jaccard.sim (fuzzy).""" + # Base cases + self.assertEqual(self.cmp_j_fuzzy.sim('', ''), 1.0) + self.assertEqual(self.cmp_j_fuzzy.sim('a', ''), 0.0) + self.assertEqual(self.cmp_j_fuzzy.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_j_fuzzy.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_j_fuzzy.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_j_fuzzy.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_j_fuzzy.sim('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual(self.cmp_j_fuzzy.sim('Nigel', 'Niall'), 0.5) + 
self.assertAlmostEqual(self.cmp_j_fuzzy.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual( + self.cmp_j_fuzzy.sim('Colin', 'Coiln'), 0.72222222222 + ) + self.assertAlmostEqual( + self.cmp_j_fuzzy.sim('Coiln', 'Colin'), 0.72222222222 + ) + self.assertAlmostEqual( + self.cmp_j_fuzzy.sim('ATCAACGAGT', 'AACGATTAG'), 0.7857142857142857 + ) + + self.assertAlmostEqual( + Jaccard(intersection_type='fuzzy').sim('synonym', 'antonym'), + 0.3333333333333333, + ) + + def test_linkage_jaccard_sim(self): + """Test abydos.distance.Jaccard.sim (group linkage).""" + # Base cases + self.assertEqual(self.cmp_j_linkage.sim('', ''), 1.0) + self.assertEqual(self.cmp_j_linkage.sim('a', ''), 0.0) + self.assertEqual(self.cmp_j_linkage.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_j_linkage.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_j_linkage.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_j_linkage.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_j_linkage.sim('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual( + self.cmp_j_linkage.sim('Nigel', 'Niall'), 0.4444444444444444 + ) + self.assertAlmostEqual( + self.cmp_j_linkage.sim('Niall', 'Nigel'), 0.4444444444444444 + ) + self.assertAlmostEqual(self.cmp_j_linkage.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_j_linkage.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_j_linkage.sim('ATCAACGAGT', 'AACGATTAG'), + 0.6071428571428571, + ) + + # Base cases + self.assertEqual(self.cmp_j_linkage_int.sim('', ''), 1.0) + self.assertEqual(self.cmp_j_linkage_int.sim('a', ''), 0.0) + self.assertEqual(self.cmp_j_linkage_int.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_j_linkage_int.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_j_linkage_int.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_j_linkage_int.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_j_linkage_int.sim('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual( + self.cmp_j_linkage_int.sim('Nigel', 'Niall'), 0.4444444444444444 + ) + self.assertAlmostEqual( + self.cmp_j_linkage_int.sim('Niall', 'Nigel'), 0.5 + ) + self.assertAlmostEqual( + self.cmp_j_linkage_int.sim('Colin', 'Coiln'), 0.5 + ) + self.assertAlmostEqual( + self.cmp_j_linkage_int.sim('Coiln', 'Colin'), 0.5 + ) + self.assertAlmostEqual( + self.cmp_j_linkage_int.sim('ATCAACGAGT', 'AACGATTAG'), + 0.6428571428571429, + ) + + self.assertAlmostEqual( + Jaccard( + intersection_type='linkage', + metric=JaroWinkler(), + threshold=0.2, + ).sim('synonym', 'antonym'), + 0.5, + ) + + def test_token_distance(self): + """Test abydos.distance._TokenDistance members.""" + self.assertAlmostEqual( + Jaccard(intersection_type='soft', alphabet=24).sim( + 'ATCAACGAGT', 'AACGATTAG' + ), + 0.6071428571428571, + ) + self.assertAlmostEqual( + Jaccard(qval=1, alphabet='CGAT').sim('ATCAACGAGT', 'AACGATTAG'), + 0.9, + ) + self.assertAlmostEqual( + Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim( + 'ATCAACGAGT', 'AACGATTAG' + ), + 0.6372795969773299, + ) + self.assertAlmostEqual( + Jaccard(alphabet=None).sim('synonym', 'antonym'), + 0.3333333333333333, + ) + self.assertAlmostEqual( + Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'), + 0.34146341463414637, + ) + + src_ctr = Counter({'a': 5, 'b': 2, 'c': 10}) + tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12}) + self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375) + + self.assertAlmostEqual( + SokalMichener(normalizer='proportional').sim('synonym', 'antonym'), + 0.984777917351113, + ) + self.assertAlmostEqual( + 
SokalMichener(normalizer='log').sim('synonym', 'antonym'), + 1.2385752469545532, + ) + self.assertAlmostEqual( + SokalMichener(normalizer='exp', alphabet=0).sim( + 'synonym', 'antonym' + ), + 3.221246147982545e18, + ) + self.assertAlmostEqual( + SokalMichener(normalizer='laplace').sim('synonym', 'antonym'), + 0.98856416772554, + ) + self.assertAlmostEqual( + SokalMichener(normalizer='inverse').sim('synonym', 'antonym'), + 197.95790155440417, + ) + self.assertAlmostEqual( + SokalMichener(normalizer='complement').sim('synonym', 'antonym'), + 1.0204081632653061, + ) + self.assertAlmostEqual( + SokalMichener(normalizer='base case').sim('synonym', 'antonym'), + 0.9897959183673469, + ) + self.assertAlmostEqual( + SokalMichener().sim('synonym', 'antonym'), 0.9897959183673469 + ) + + sm = SokalMichener() + sm._tokenize('synonym', 'antonym') # noqa: SF01 + + self.assertEqual( + sm._get_tokens(), # noqa: SF01 + ( + Counter( + { + '$s': 1, + 'sy': 1, + 'yn': 1, + 'no': 1, + 'on': 1, + 'ny': 1, + 'ym': 1, + 'm#': 1, + } + ), + Counter( + { + '$a': 1, + 'an': 1, + 'nt': 1, + 'to': 1, + 'on': 1, + 'ny': 1, + 'ym': 1, + 'm#': 1, + } + ), + ), + ) + self.assertEqual(sm._src_card(), 8) # noqa: SF01 + self.assertEqual(sm._tar_card(), 8) # noqa: SF01 + self.assertEqual( + sm._symmetric_difference(), # noqa: SF01 + Counter( + { + '$s': 1, + 'sy': 1, + 'yn': 1, + 'no': 1, + '$a': 1, + 'an': 1, + 'nt': 1, + 'to': 1, + } + ), + ) + self.assertEqual(sm._symmetric_difference_card(), 8) # noqa: SF01 + self.assertEqual(sm._total_complement_card(), 772) # noqa: SF01 + self.assertEqual(sm._population_card(), 788) # noqa: SF01 + self.assertEqual( + sm._union(), # noqa: SF01 + Counter( + { + '$s': 1, + 'sy': 1, + 'yn': 1, + 'no': 1, + 'on': 1, + 'ny': 1, + 'ym': 1, + 'm#': 1, + '$a': 1, + 'an': 1, + 'nt': 1, + 'to': 1, + } + ), + ) + self.assertEqual(sm._union_card(), 12) # noqa: SF01 + self.assertEqual( + sm._difference(), # noqa: SF01 + Counter( + { + '$s': 1, + 'sy': 1, + 'yn': 1, + 'no': 1, + 'on': 0, + 'ny': 0, + 'ym': 0, + 'm#': 0, + '$a': -1, + 'an': -1, + 'nt': -1, + 'to': -1, + } + ), + ) + self.assertEqual( + sm._intersection(), # noqa: SF01 + Counter({'on': 1, 'ny': 1, 'ym': 1, 'm#': 1}), + ) + self.assertEqual( + sm._get_confusion_table(), # noqa: SF01 + ConfusionTable(tp=4, tn=772, fp=4, fn=4), + ) + + sm = SokalMichener( + alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1 + ) + sm._tokenize('ATCAACGAGT', 'AACGATTAG') # noqa: SF01 + self.assertEqual(sm._total_complement_card(), 61) # noqa: SF01 + + jac = Jaccard( + intersection_type='linkage', internal_assignment_problem=True + ) + self.assertAlmostEqual( + jac.sim('abandonned', 'abandoned'), 0.954545454545 + ) + self.assertAlmostEqual( + jac.sim('abundacies', 'abundances'), 0.607142857143 + ) + + # Some additional constructors needed to complete test coverage + self.assertAlmostEqual( + Jaccard(alphabet=None, qval=range(2, 4)).sim('abc', 'abcd'), + 0.42857142857142855, + ) + self.assertAlmostEqual( + AverageLinkage(qval=range(2, 4)).sim('abc', 'abcd'), + 0.22558922558922556, + ) + self.assertAlmostEqual( + Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)).sim( + 'abc', 'abcd' + ), + 0.42857142857142855, + ) + self.assertAlmostEqual( + Jaccard( + alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer() + ).sim('abc', 'abcd'), + 0.0, + ) + self.assertAlmostEqual( + Jaccard(alphabet=list('abcdefghijklmnop')).sim('abc', 'abcd'), 0.5 + ) + self.assertAlmostEqual( + Jaccard(tokenizer=CharacterTokenizer()).sim('abc', 'abcd'), 0.75 + ) + + 
+if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_aline.py b/tests/distance/test_distance_aline.py new file mode 100644 index 000000000..78b02f237 --- /dev/null +++ b/tests/distance/test_distance_aline.py @@ -0,0 +1,595 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_aline. + +This module contains unit tests for abydos.distance.ALINE +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ALINE + + +class ALINETestCases(unittest.TestCase): + """Test ALINE functions. + + abydos.distance.ALINE + """ + + cmp = ALINE() + cmp_downey = ALINE(normalizer=lambda x: sum(x) / len(x)) + + def test_aline_alignment(self): + """Test abydos.distance.ALINE.alignment.""" + # test cases from Kondrak (2000) + self.assertEqual( + self.cmp.alignment('driy', 'tres'), + [(75.0, '‖ d r iy ‖', '‖ t r e ‖ s')], + ) + self.assertEqual( + self.cmp.alignment('blow', 'flare'), + [(53.0, '‖ b l o ‖ w', '‖ f l a ‖ re')], + ) + self.assertEqual( + self.cmp.alignment('ful', 'plenus'), + [(48.0, '‖ f u l ‖', '‖ p - l ‖ enus')], + ) + self.assertEqual( + self.cmp.alignment('fiz', 'piskis'), + [(63.0, '‖ f i z ‖', '‖ p i s ‖ kis')], + ) + self.assertEqual( + self.cmp.alignment('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')] + ) + self.assertEqual( + self.cmp.alignment('tuwz', 'dentis'), + [(75.0, '‖ t uw z ‖', 'den ‖ t i s ‖')], + ) + + # test cases from Kondrak (2002) after Covington (1996) + # Some of these alignments are a little different from what's in the + # thesis because of the differing encoding used. 
+ self.assertEqual( + self.cmp.alignment('jo', 'zPe'), [(29.0, '‖ j o ‖', '‖ zP e ‖')] + ) + self.assertEqual( + self.cmp.alignment('tu', 'tuF'), [(45.0, '‖ t u ‖', '‖ t uF ‖')] + ) + self.assertEqual( + self.cmp.alignment('nostros', 'nu'), + [(47.5, '‖ n o ‖ stros', '‖ n u ‖')], + ) + self.assertEqual( + self.cmp.alignment('kyen', 'ki'), + [(47.5, '‖ k ye ‖ n', '‖ k i ‖')], + ) + self.assertEqual( + self.cmp.alignment('ke', 'kwa'), [(42.5, '‖ k e ‖', '‖ k wa ‖')] + ) + self.assertEqual( + self.cmp.alignment('todos', 'tu'), + [(47.5, '‖ t o ‖ dos', '‖ t u ‖')], + ) + self.assertEqual( + self.cmp.alignment('una', 'uFn'), + [(45.0, '‖ u n ‖ a', '‖ uF n ‖')], + ) + self.assertEqual( + self.cmp.alignment('dos', 'doF'), + [(45.0, '‖ d o ‖ s', '‖ d oF ‖')], + ) + self.assertEqual( + self.cmp.alignment('tres', 'trwa'), + [(77.5, '‖ t r e ‖ s', '‖ t r wa ‖')], + ) + self.assertEqual( + self.cmp.alignment('ombre', 'om'), + [ + (50.0, '‖ o m ‖ bre', '‖ o m ‖'), + (50.0, '‖ o mb ‖ re', '‖ o m ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('arbol', 'arbreC'), + [(88.0, '‖ a r b o l ‖', '‖ a r b - r ‖ eC')], + ) + self.assertEqual( + self.cmp.alignment('pluFma', 'plum'), + [(115.0, '‖ p l uF m ‖ a', '‖ p l u m ‖')], + ) + self.assertEqual( + self.cmp.alignment('kabetSa', 'kap'), + [(75.0, '‖ k a b ‖ etSa', '‖ k a p ‖')], + ) + self.assertEqual( + self.cmp.alignment('boka', 'busP'), + [(68.5, '‖ b o k ‖ a', '‖ b u sP ‖')], + ) + self.assertEqual( + self.cmp.alignment('pye', 'pye'), + [(65.0, '‖ p y e ‖', '‖ p y e ‖')], + ) + self.assertEqual( + self.cmp.alignment('koratSon', 'koFr'), + [(80.0, '‖ k o r ‖ atSon', '‖ k oF r ‖')], + ) + self.assertEqual( + self.cmp.alignment('ber', 'vwar'), + [(60.5, '‖ b e r ‖', '‖ v wa r ‖')], + ) + self.assertEqual( + self.cmp.alignment('benir', 'veCnir'), + [(115.5, '‖ b e n i r ‖', '‖ v eC n i r ‖')], + ) + self.assertEqual( + self.cmp.alignment('detSir', 'dir'), + [ + (65.0, 'de ‖ tS i r ‖', '‖ d i r ‖'), + (65.0, '‖ d e tS i r ‖', '‖ d - - i r ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('pobre', 'povreC'), + [(115.5, '‖ p o b r e ‖', '‖ p o v r eC ‖')], + ) + self.assertEqual( + self.cmp.alignment('dSis', 'diHzes'), + [(77.5, '‖ dS i s ‖', 'diH ‖ z e s ‖')], + ) + self.assertEqual( + self.cmp.alignment('dSaFt', 'das'), + [(62.5, '‖ dS aF t ‖', '‖ d a s ‖')], + ) + # Different from paper: + self.assertEqual( + self.cmp.alignment('wat', 'vas'), + [(40.0, 'w ‖ a t ‖', 'v ‖ a s ‖')], + ) + self.assertEqual( + self.cmp.alignment('nat', 'nixt'), + [ + (62.5, '‖ n a - t ‖', '‖ n i x t ‖'), + (62.5, '‖ n a t ‖', '‖ n i xt ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('logN', 'lagN'), + [(75.0, '‖ l o gN ‖', '‖ l a gN ‖')], + ) + self.assertEqual( + self.cmp.alignment('maFn', 'man'), + [(82.5, '‖ m aF n ‖', '‖ m a n ‖')], + ) + self.assertEqual( + self.cmp.alignment('flesP', 'flaysP'), + [(122.5, '‖ f l e sP ‖', '‖ f l ay sP ‖')], + ) + self.assertEqual( + self.cmp.alignment('bleCd', 'bluHt'), + [(99.0, '‖ b l eC d ‖', '‖ b l uH t ‖')], + ) + self.assertEqual( + self.cmp.alignment('fedSeCr', 'feHdeCr'), + [(124.0, '‖ f e dS eC r ‖', '‖ f eH d eC r ‖')], + ) + self.assertEqual( + self.cmp.alignment('haFr', 'haHr'), + [(81.5, '‖ h aF r ‖', '‖ h aH r ‖')], + ) + self.assertEqual( + self.cmp.alignment('ir', 'oHr'), [(41.5, '‖ i r ‖', '‖ oH r ‖')] + ) + self.assertEqual( + self.cmp.alignment('ay', 'awgeC'), + [(20.0, '‖ a y ‖', '‖ a w ‖ geC')], + ) + self.assertEqual( + self.cmp.alignment('nowz', 'naHzeC'), + [(70.5, '‖ n ow z ‖', '‖ n aH z ‖ eC')], + 
) + self.assertEqual( + self.cmp.alignment('mawtS', 'munt'), + [(62.5, '‖ m aw - tS ‖', '‖ m u n t ‖')], + ) + self.assertEqual( + self.cmp.alignment('teCgN', 'tsugNeC'), + [(75.0, '‖ t eC gN ‖', '‖ ts u gN ‖ eC')], + ) + self.assertEqual( + self.cmp.alignment('fut', 'fuHs'), + [(74.0, '‖ f u t ‖', '‖ f uH s ‖')], + ) + self.assertEqual( + self.cmp.alignment('niy', 'kniH'), + [(53.0, '‖ n iy ‖', 'k ‖ n iH ‖')], + ) + self.assertEqual( + self.cmp.alignment('haFnd', 'hant'), + [(107.5, '‖ h aF n d ‖', '‖ h a n t ‖')], + ) + self.assertEqual( + self.cmp.alignment('hart', 'herts'), + [ + (115.0, '‖ h a r t ‖', '‖ h e r t ‖ s'), + (115.0, '‖ h a r t ‖', '‖ h e r ts ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('liveCr', 'leHbeCr'), + [(109.5, '‖ l i v eC r ‖', '‖ l eH b eC r ‖')], + ) + self.assertEqual( + self.cmp.alignment('aFnd', 'ante'), + [(72.5, '‖ aF n d ‖', '‖ a n t ‖ e')], + ) + self.assertEqual( + self.cmp.alignment('aFt', 'ad'), [(37.5, '‖ aF t ‖', '‖ a d ‖')] + ) + self.assertEqual( + self.cmp.alignment('blow', 'flaHre'), + [(52.0, '‖ b l o ‖ w', '‖ f l aH ‖ re')], + ) + # Different from paper: + self.assertEqual( + self.cmp.alignment('ir', 'awris'), + [(45.0, '‖ i r ‖', 'a ‖ w r ‖ is')], + ) + self.assertEqual( + self.cmp.alignment('iyt', 'edere'), + [(40.0, '‖ iy t ‖', '‖ e d ‖ ere')], + ) + self.assertEqual( + self.cmp.alignment('fisS', 'piskis'), + [(73.0, '‖ f i sS ‖', '‖ p i s ‖ kis')], + ) + self.assertEqual( + self.cmp.alignment('flow', 'fluere'), + [(92.5, '‖ f l ow ‖', '‖ f l u ‖ ere')], + ) + self.assertEqual( + self.cmp.alignment('star', 'steHlla'), + [(92.0, '‖ s t a r ‖', '‖ s t eH l ‖ la')], + ) + self.assertEqual( + self.cmp.alignment('ful', 'pleHnus'), + [(48.0, '‖ f u l ‖', '‖ p - l ‖ eHnus')], + ) + self.assertEqual( + self.cmp.alignment('graFs', 'graHmen'), + [(81.5, '‖ g r aF ‖ s', '‖ g r aH ‖ men')], + ) + self.assertEqual( + self.cmp.alignment('hart', 'kordis'), + [(70.0, '‖ h a r t ‖', '‖ k o r d ‖ is')], + ) + self.assertEqual( + self.cmp.alignment('horn', 'kornuH'), + [(90.0, '‖ h o r n ‖', '‖ k o r n ‖ uH')], + ) + self.assertEqual( + self.cmp.alignment('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')] + ) + self.assertEqual( + self.cmp.alignment('niy', 'genuH'), + [(44.0, '‖ n i ‖ y', 'ge ‖ n uH ‖')], + ) + self.assertEqual( + self.cmp.alignment('meCdSeCr', 'maHter'), + [(109.0, '‖ m eC dS eC r ‖', '‖ m aH t e r ‖')], + ) + self.assertEqual( + self.cmp.alignment('mawnteCn', 'moHns'), + [(105.5, '‖ m aw n t ‖ eCn', '‖ m oH n s ‖')], + ) + # The example below is different from the expected, but + # (73.0, '‖ n ey m ‖', '‖ n oH m ‖ en') is the #2 alignment. + # This is probably due to slightly differing weights/costs/features. 
+ self.assertEqual( + self.cmp.alignment('neym', 'noHmen'), + [(80.5, '‖ n ey m ‖', 'noH ‖ m e n ‖')], + ) + self.assertEqual( + self.cmp.alignment('nyuw', 'nowus'), + [(70.0, '‖ n yu w ‖', '‖ n o wu ‖ s')], + ) + self.assertEqual( + self.cmp.alignment('weCn', 'uHnus'), + [(48.0, '‖ weC n ‖', '‖ uH n ‖ us')], + ) + self.assertEqual( + self.cmp.alignment('rawnd', 'rotundus'), + [(115.0, '‖ r a - w n d ‖', '‖ r o t u n d ‖ us')], + ) + self.assertEqual( + self.cmp.alignment('sow', 'suere'), + [(57.5, '‖ s ow ‖', '‖ s u ‖ ere')], + ) + self.assertEqual( + self.cmp.alignment('sit', 'seHdere'), + [(66.5, '‖ s i t ‖', '‖ s eH d ‖ ere')], + ) + self.assertEqual( + self.cmp.alignment('tSriy', 'treHs'), + [(73.0, '‖ tS r iy ‖', '‖ t r eH ‖ s')], + ) + self.assertEqual( + self.cmp.alignment('tuwtS', 'dentis'), + [(85.0, '‖ t uw tS ‖', 'den ‖ t i s ‖')], + ) + self.assertEqual( + self.cmp.alignment('tSin', 'tenuis'), + [(67.5, '‖ tS i n ‖', '‖ t e n ‖ uis')], + ) + self.assertEqual( + self.cmp.alignment('kiHnwaHwa', 'kenuaq'), + [(105.5, '‖ k iH n w aH ‖ wa', '‖ k e n u a ‖ q')], + ) + self.assertEqual( + self.cmp.alignment('niHna', 'nenah'), + [(91.5, '‖ n iH n a ‖', '‖ n e n a ‖ h')], + ) + self.assertEqual( + self.cmp.alignment('naHpeHwa', 'naHpeHw'), + [(115.0, '‖ n aH p eH w ‖ a', '‖ n aH p eH w ‖')], + ) + self.assertEqual( + self.cmp.alignment('waHpimini', 'waHpemen'), + [(150.0, '‖ w aH p i m i n ‖ i', '‖ w aH p e m e n ‖')], + ) + self.assertEqual( + self.cmp.alignment('nameHsa', 'nameHqs'), + [(125.0, '‖ n a m eH - s ‖ a', '‖ n a m eH q s ‖')], + ) + self.assertEqual( + self.cmp.alignment('okimaHwa', 'okeHmaHw'), + [(121.5, '‖ o k i m aH w ‖ a', '‖ o k eH m aH w ‖')], + ) + self.assertEqual( + self.cmp.alignment('sPiHsPiHpa', 'seHqsep'), + [(97.0, '‖ sP iH - sP iH p ‖ a', '‖ s eH q s e p ‖')], + ) + self.assertEqual( + self.cmp.alignment('ahkohkwa', 'ahkeHh'), + [(124.0, '‖ a h k o h ‖ kwa', '‖ a h k eH h ‖')], + ) + self.assertEqual( + self.cmp.alignment('pemaHtesiweni', 'pemaHtesewen'), + [ + ( + 257.5, + '‖ p e m aH t e s i w e n ‖ i', + '‖ p e m aH t e s e w e n ‖', + ) + ], + ) + self.assertEqual( + self.cmp.alignment('asenya', 'aqsen'), + [(90.0, '‖ a - s e n ‖ ya', '‖ a q s e n ‖')], + ) + self.assertEqual( + self.cmp.alignment('didoHmi', 'doH'), + [(50.0, 'di ‖ d oH ‖ mi', '‖ d oH ‖')], + ) + self.assertEqual( + self.cmp.alignment('tAugateEr', 'toxteCr'), + [(130.0, '‖ tA u g a t e r ‖', '‖ t o x - t eC r ‖')], + ) + self.assertEqual( + self.cmp.alignment('doteCr', 'tAugateEr'), + [(112.5, '‖ d o t eC r ‖', 'tAu ‖ g a t e r ‖')], + ) + self.assertEqual( + self.cmp.alignment('ager', 'azPras'), + [(61.0, '‖ a g e r ‖', '‖ a zP - r ‖ as')], + ) + self.assertEqual( + self.cmp.alignment('bAaraHmi', 'pAero'), + [(74.0, '‖ bA a r aH ‖ mi', '‖ pA e r o ‖')], + ) + self.assertEqual( + self.cmp.alignment('kentum', 'hekaton'), + [ + (111.5, '‖ k e n t u m ‖', 'he ‖ k a - t o n ‖'), + (111.5, '‖ k e nt u m ‖', 'he ‖ k a t o n ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('kentum', 'sateCm'), + [ + (90.0, '‖ k e n t u m ‖', '‖ s a - t eC m ‖'), + (90.0, '‖ k e nt u m ‖', '‖ s a t eC m ‖'), + ], + ) + + # test cases from Downey, et al. 
(2008) + self.assertEqual( + self.cmp.alignment('api', 'api'), + [(65.0, '‖ a p i ‖', '‖ a p i ‖')], + ) + self.assertEqual( + self.cmp.alignment('apik', 'apik'), + [(100.0, '‖ a p i k ‖', '‖ a p i k ‖')], + ) + self.assertEqual( + self.cmp.alignment('apila', 'apila'), + [(115.0, '‖ a p i l a ‖', '‖ a p i l a ‖')], + ) + self.assertEqual( + self.cmp.alignment('api', 'apik'), + [(65.0, '‖ a p i ‖', '‖ a p i ‖ k')], + ) + self.assertEqual( + self.cmp.alignment('api', 'apila'), + [(65.0, '‖ a p i ‖', '‖ a p i ‖ la')], + ) + self.assertEqual( + self.cmp.alignment('apik', 'apila'), + [(65.0, '‖ a p i ‖ k', '‖ a p i ‖ la')], + ) + self.assertEqual( + self.cmp.alignment('kalarita', 'kalarita'), + [(200.0, '‖ k a l a r i t a ‖', '‖ k a l a r i t a ‖')], + ) + self.assertEqual( + self.cmp.alignment('kalara', 'kalara'), + [(150.0, '‖ k a l a r a ‖', '‖ k a l a r a ‖')], + ) + self.assertEqual( + self.cmp.alignment('makebela', 'makebela'), + [(200.0, '‖ m a k e b e l a ‖', '‖ m a k e b e l a ‖')], + ) + # The following case has a different score, but the same alignment as + # in Downey, et al. (2008) + self.assertEqual( + self.cmp.alignment('kalarita', 'kalara'), + [(137.5, '‖ k a l a r i ‖ ta', '‖ k a l a r a ‖')], + ) + self.assertEqual( + self.cmp.alignment('kalarita', 'makebela'), + [ + (75.0, '‖ k - - a l a ‖ rita', 'ma ‖ k e b e l a ‖'), + (75.0, '‖ k a - - l a ‖ rita', 'ma ‖ k e b e l a ‖'), + ], + ) + self.assertEqual( + self.cmp.alignment('kalara', 'makebela'), + [(82.0, '‖ k a l a r a ‖', 'ma ‖ k e b e l a ‖')], + ) + + # other alignment styles: + cmp2 = ALINE(mode='local') + self.assertEqual( + cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), + [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], + ) + cmp2 = ALINE(mode='semi-global') + self.assertEqual( + cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), + [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], + ) + cmp2 = ALINE(mode='half-local') + self.assertEqual( + cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), + [(110.0, '‖ aH p a k o s iH s - ‖', 'w ‖ aH p i k o n oH h a ‖')], + ) + cmp2 = ALINE(mode='global') + self.assertEqual( + cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), + [(106.5, '‖ aH p a k o s iH s - ‖', '‖ waH p i k o n oH h a ‖')], + ) + # The following just confirms that unknown values of mode use 'local' + cmp2 = ALINE(mode='universal') + self.assertEqual( + cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), + [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], + ) + self.assertEqual( + cmp2.alignment('kan', 'kaABCDHn'), + [(84.0, '‖ k a n ‖', '‖ k aABCDH n ‖')], + ) + self.assertEqual( + cmp2.alignment('kaABCDHn', 'kan'), + [(84.0, '‖ k aABCDH n ‖', '‖ k a n ‖')], + ) + cmp2 = ALINE(phones='ipa') + self.assertEqual( + cmp2.alignment('kɒgneit', 'kognaːtus'), + [(163.0, '‖ k ɒ g n ei t ‖', '‖ k o g n aː t ‖ us')], + ) + + def test_aline_sim(self): + """Test abydos.distance.ALINE.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.425) + + self.assertAlmostEqual(self.cmp.sim('nigel', 'niall'), 0.7037037037) + self.assertAlmostEqual(self.cmp.sim('niall', 'nigel'), 0.7037037037) + self.assertAlmostEqual(self.cmp.sim('colin', 'coiln'), 0.8333333333) + self.assertAlmostEqual(self.cmp.sim('coiln', 
'colin'), 0.8333333333) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT'.lower(), 'AACGATTAG'.lower()), + 0.685185185, + ) + + # test cases from Downey, et al. (2008) + self.assertAlmostEqual(self.cmp_downey.sim('api', 'api'), 1.0) + self.assertAlmostEqual(self.cmp_downey.sim('apik', 'apik'), 1.0) + self.assertAlmostEqual(self.cmp_downey.sim('apila', 'apila'), 1.0) + self.assertAlmostEqual( + self.cmp_downey.sim('api', 'apik'), 0.7878787879 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('api', 'apila'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('apik', 'apila'), 0.6046511628 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('kalarita', 'kalarita'), 1.0 + ) + self.assertAlmostEqual(self.cmp_downey.sim('kalara', 'kalara'), 1.0) + self.assertAlmostEqual( + self.cmp_downey.sim('makebela', 'makebela'), 1.0 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('kalarita', 'kalara'), 0.785714286 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('kalarita', 'makebela'), 0.375 + ) + self.assertAlmostEqual( + self.cmp_downey.sim('kalara', 'makebela'), 0.468571429 + ) + + def test_aline_sim_score(self): + """Test abydos.distance.ALINE.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 1.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 85.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 51.0) + + self.assertAlmostEqual(self.cmp.sim_score('nigel', 'niall'), 95.0) + self.assertAlmostEqual(self.cmp.sim_score('niall', 'nigel'), 95.0) + self.assertAlmostEqual(self.cmp.sim_score('colin', 'coiln'), 112.5) + self.assertAlmostEqual(self.cmp.sim_score('coiln', 'colin'), 112.5) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT'.lower(), 'AACGATTAG'.lower()), + 185.0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_ample.py b/tests/distance/test_distance_ample.py new file mode 100644 index 000000000..a0b191d97 --- /dev/null +++ b/tests/distance/test_distance_ample.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_ample. + +This module contains unit tests for abydos.distance.AMPLE +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import AMPLE + + +class AMPLETestCases(unittest.TestCase): + """Test AMPLE functions. 
+ + abydos.distance.AMPLE + """ + + cmp = AMPLE() + cmp_no_d = AMPLE(alphabet=0) + cmp_dna = AMPLE(qval=1, alphabet='CGAT') + + def test_ample_sim(self): + """Test abydos.distance.AMPLE.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.002551020408163265) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.006418485237483954) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6324826532 + ) + self.assertAlmostEqual(self.cmp_dna.sim('CGAT', 'CGA'), 0.75) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_anderberg.py b/tests/distance/test_distance_anderberg.py new file mode 100644 index 000000000..c58a19b34 --- /dev/null +++ b/tests/distance/test_distance_anderberg.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_anderberg. + +This module contains unit tests for abydos.distance.Anderberg +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Anderberg + + +class AnderbergTestCases(unittest.TestCase): + """Test Anderberg functions. 
+ + abydos.distance.Anderberg + """ + + cmp = Anderberg() + cmp_no_d = Anderberg(alphabet=0) + cmp_1 = Anderberg(qval=1) + + def test_anderberg_sim(self): + """Test abydos.distance.Anderberg.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.01020408163265306) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + self.assertAlmostEqual( + self.cmp_1.sim('abcdefghijklm', 'abcdefghijklm'), 1.0 + ) + self.assertAlmostEqual( + self.cmp_1.sim('abcdefghijklm', 'nopqrstuvwxyz'), 1.0 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_anderberg_sim_score(self): + """Test abydos.distance.Anderberg.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.0044642857 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 
'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_andres_marzo_delta.py b/tests/distance/test_distance_andres_marzo_delta.py new file mode 100644 index 000000000..c40c348ed --- /dev/null +++ b/tests/distance/test_distance_andres_marzo_delta.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_andres_marzo_delta. + +This module contains unit tests for abydos.distance.AndresMarzoDelta +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import AndresMarzoDelta + + +class AndresMarzoDeltaTestCases(unittest.TestCase): + """Test AndresMarzoDelta functions. + + abydos.distance.AndresMarzoDelta + """ + + cmp = AndresMarzoDelta() + cmp_no_d = AndresMarzoDelta(alphabet=0) + + def test_andres_marzo_delta_sim(self): + """Test abydos.distance.AndresMarzoDelta.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987244897959184) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987244897959184) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9911172173 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5025641703 + ) + + def test_andres_marzo_delta_corr(self): + """Test abydos.distance.AndresMarzoDelta.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.corr('', 'a'), 
0.9974489795918368) + self.assertEqual(self.cmp.corr('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.corr('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.9744897959183674) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9846938776) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9822344347 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0051283407 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_average_linkage.py b/tests/distance/test_distance_average_linkage.py new file mode 100644 index 000000000..ca359b153 --- /dev/null +++ b/tests/distance/test_distance_average_linkage.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_average_linkage. + +This module contains unit tests for abydos.distance.AverageLinkage +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import AverageLinkage, Prefix +from abydos.tokenizer import QGrams + + +class AverageLinkageTestCases(unittest.TestCase): + """Test AverageLinkage functions. 
+ + abydos.distance.AverageLinkage + """ + + cmp = AverageLinkage() + cmp1 = AverageLinkage(tokenizer=QGrams(1)) + cmp_pfx = AverageLinkage(metric=Prefix()) + + def test_average_linkage_dist(self): + """Test abydos.distance.AverageLinkage.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.75) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.96) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8611111111) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8611111111) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8333333333) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.7545454545 + ) + + self.assertEqual(self.cmp1.dist('aaa', 'aaa'), 0.0) + self.assertAlmostEqual(self.cmp_pfx.dist('ababab', 'ab'), 0.714285714) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_azzoo.py b/tests/distance/test_distance_azzoo.py new file mode 100644 index 000000000..1d736a1c3 --- /dev/null +++ b/tests/distance/test_distance_azzoo.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_azzoo. + +This module contains unit tests for abydos.distance.AZZOO +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import AZZOO + + +class AZZOOTestCases(unittest.TestCase): + """Test AZZOO functions. 
+ + abydos.distance.AZZOO + """ + + cmp = AZZOO() + cmp_no_d = AZZOO(alphabet=0) + + def test_azzoo_sim(self): + """Test abydos.distance.AZZOO.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9949109414758269) + self.assertEqual(self.cmp.sim('', 'a'), 0.9949109414758269) + self.assertEqual(self.cmp.sim('abc', ''), 0.9898477157360406) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9809885931558935) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9886075949) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9886075949) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9886075949) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9886075949) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.986163522 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + def test_azzoo_sim_score(self): + """Test abydos.distance.AZZOO.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 392.0) + self.assertEqual(self.cmp.sim_score('a', ''), 391.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 391.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 390.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 390.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 394.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 387.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 390.5) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 390.5) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 390.5) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 390.5) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 392.0 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 4.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 3.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 3.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 3.0) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 3.0) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 7.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_bag.py 
b/tests/distance/test_distance_bag.py index f42485f2b..6f21f43c8 100644 --- a/tests/distance/test_distance_bag.py +++ b/tests/distance/test_distance_bag.py @@ -31,6 +31,7 @@ import unittest from abydos.distance import Bag, bag, dist_bag, sim_bag +from abydos.tokenizer import SAPSTokenizer class BagTestCases(unittest.TestCase): @@ -72,6 +73,8 @@ def test_bag_sim(self): self.assertEqual(self.cmp.sim('abcdefg', 'hijklm'), 0) self.assertEqual(self.cmp.sim('abcdefg', 'hijklmno'), 0) + self.assertEqual(Bag(tokenizer=SAPSTokenizer()).sim('DNA', 'RNA'), 0.5) + # Test wrapper self.assertAlmostEqual(sim_bag('nelson', 'neilsen'), 5 / 7) diff --git a/tests/distance/test_distance_baroni_urbani_buser_i.py b/tests/distance/test_distance_baroni_urbani_buser_i.py new file mode 100644 index 000000000..0ce2ea9fb --- /dev/null +++ b/tests/distance/test_distance_baroni_urbani_buser_i.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baroni_urbani_buser_i. + +This module contains unit tests for abydos.distance.BaroniUrbaniBuserI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaroniUrbaniBuserI + + +class BaroniUrbaniBuserITestCases(unittest.TestCase): + """Test BaroniUrbaniBuserI functions. 
+ + abydos.distance.BaroniUrbaniBuserI + """ + + cmp = BaroniUrbaniBuserI() + cmp_no_d = BaroniUrbaniBuserI(alphabet=0) + + def test_baroni_urbani_buser_i_sim(self): + """Test abydos.distance.BaroniUrbaniBuserI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8951383588) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9199236936 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_baroni_urbani_buser_i_dist(self): + """Test abydos.distance.BaroniUrbaniBuserI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1048616412) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0800763064 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/distance/test_distance_baroni_urbani_buser_ii.py b/tests/distance/test_distance_baroni_urbani_buser_ii.py new file mode 100644 index 000000000..f45c7ea3b --- /dev/null +++ b/tests/distance/test_distance_baroni_urbani_buser_ii.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baroni_urbani_buser_ii. + +This module contains unit tests for abydos.distance.BaroniUrbaniBuserII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaroniUrbaniBuserII + + +class BaroniUrbaniBuserIITestCases(unittest.TestCase): + """Test BaroniUrbaniBuserII functions. + + abydos.distance.BaroniUrbaniBuserII + """ + + cmp = BaroniUrbaniBuserII() + cmp_no_d = BaroniUrbaniBuserII(alphabet=0) + + def test_baroni_urbani_buser_ii_sim(self): + """Test abydos.distance.BaroniUrbaniBuserII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8951383588) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8951383588) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9199236936 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_baroni_urbani_buser_ii_dist(self): + """Test abydos.distance.BaroniUrbaniBuserII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + 
self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1048616412) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1048616412) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0800763064 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_baroni_urbani_buser_ii_corr(self): + """Test abydos.distance.BaroniUrbaniBuserII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.7902767176) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.7902767176) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.7902767176) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.7902767176) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.8398473871 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_batagelj_bren.py b/tests/distance/test_distance_batagelj_bren.py new file mode 100644 index 000000000..e9fa48d99 --- /dev/null +++ b/tests/distance/test_distance_batagelj_bren.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_batagelj_bren. + +This module contains unit tests for abydos.distance.BatageljBren +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BatageljBren + + +class BatageljBrenTestCases(unittest.TestCase): + """Test BatageljBren functions. + + abydos.distance.BatageljBren + """ + + cmp = BatageljBren() + cmp_no_d = BatageljBren(alphabet=0) + + def test_batagelj_bren_dist(self): + """Test abydos.distance.BatageljBren.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 4.9375e-06) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 4.9375e-06) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 4.9375e-06) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 4.9375e-06) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 2.8397e-06 + ) + + def test_batagelj_bren_sim(self): + """Test abydos.distance.BatageljBren.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9999950625) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9999950625) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9999950625) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9999950625) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9999971603 + ) + + def test_batagelj_bren_dist_abs(self): + """Test abydos.distance.BatageljBren.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), float('inf')) + self.assertEqual(self.cmp.dist_abs('', 'a'), float('inf')) + self.assertEqual(self.cmp.dist_abs('abc', ''), float('inf')) + self.assertEqual(self.cmp.dist_abs('', 'abc'), float('inf')) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), float('inf')) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 0.0038709677 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 0.0038709677 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.0038709677 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 
'Colin'), 0.0038709677 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.0022263451 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_i.py b/tests/distance/test_distance_baulieu_i.py new file mode 100644 index 000000000..0396f9142 --- /dev/null +++ b/tests/distance/test_distance_baulieu_i.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_i. + +This module contains unit tests for abydos.distance.BaulieuI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuI + + +class BaulieuITestCases(unittest.TestCase): + """Test BaulieuI functions. + + abydos.distance.BaulieuI + """ + + cmp = BaulieuI() + cmp_no_d = BaulieuI(alphabet=0) + + def test_baulieu_i_dist(self): + """Test abydos.distance.BaulieuI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + def test_baulieu_i_sim(self): + """Test abydos.distance.BaulieuI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_ii.py b/tests/distance/test_distance_baulieu_ii.py new file mode 100644 index 000000000..53b38d4d7 --- /dev/null +++ b/tests/distance/test_distance_baulieu_ii.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_ii. + +This module contains unit tests for abydos.distance.BaulieuII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuII + + +class BaulieuIITestCases(unittest.TestCase): + """Test BaulieuII functions. + + abydos.distance.BaulieuII + """ + + cmp = BaulieuII() + cmp_no_d = BaulieuII(alphabet=0) + + def test_baulieu_ii_sim(self): + """Test abydos.distance.BaulieuII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2480756967) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2480756967) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2480756967) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2480756967) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4414325876 + ) + + def test_baulieu_ii_dist(self): + """Test abydos.distance.BaulieuII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7519243033) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7519243033) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7519243033) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7519243033) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5585674124 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_iii.py b/tests/distance/test_distance_baulieu_iii.py new file mode 100644 index 000000000..ef7885009 --- /dev/null +++ b/tests/distance/test_distance_baulieu_iii.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_iii. + +This module contains unit tests for abydos.distance.BaulieuIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuIII + + +class BaulieuIIITestCases(unittest.TestCase): + """Test BaulieuIII functions. + + abydos.distance.BaulieuIII + """ + + cmp = BaulieuIII() + cmp_no_d = BaulieuIII(alphabet=0) + + def test_baulieu_iii_dist(self): + """Test abydos.distance.BaulieuIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.48984798000832985) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5000813463140358) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4924640775) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4825007809 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6224489796 + ) + + def test_baulieu_iii_sim(self): + """Test abydos.distance.BaulieuIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5101520199916701) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4999186536859642) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5075359225) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5174992191 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + 
self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3775510204 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_iv.py b/tests/distance/test_distance_baulieu_iv.py new file mode 100644 index 000000000..83d4b88b8 --- /dev/null +++ b/tests/distance/test_distance_baulieu_iv.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_iv. + +This module contains unit tests for abydos.distance.BaulieuIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuIV + + +class BaulieuIVTestCases(unittest.TestCase): + """Test BaulieuIV functions. 
+ + abydos.distance.BaulieuIV + """ + + cmp = BaulieuIV() + cmp_no_d = BaulieuIV(alphabet=0) + + def test_baulieu_iv_dist(self): + """Test abydos.distance.BaulieuIV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.4999995930090347) + self.assertEqual(self.cmp.dist('a', ''), 0.4999995950831843) + self.assertEqual(self.cmp.dist('', 'a'), 0.4999995950831843) + self.assertEqual(self.cmp.dist('abc', ''), 0.49999959715204023) + self.assertEqual(self.cmp.dist('', 'abc'), 0.49999959715204023) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.49999637435083444) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.4999996033268451) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4999972161) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4999972161) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4999972161) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4999972161) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4999941112 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5229924650732152) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5229924650732152) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5028740581341519) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5028740581341519) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.5001839397205857 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5001682119 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5001682119 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5001682119 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5001682119 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5000335167 + ) + + def test_baulieu_iv_sim(self): + """Test abydos.distance.BaulieuIV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5000004069909654) + self.assertEqual(self.cmp.sim('a', ''), 0.5000004049168156) + self.assertEqual(self.cmp.sim('', 'a'), 0.5000004049168156) + self.assertEqual(self.cmp.sim('abc', ''), 0.5000004028479598) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5000004028479598) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5000036256491656) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5000003966731549) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5000027839) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5000027839) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5000027839) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5000027839) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5000058888 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.4770075349267848) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.4770075349267848) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.49712594186584813) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.49712594186584813) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.4998160602794143) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4998317881 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4998317881 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4998317881 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4998317881 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4999664833 + ) + + def test_baulieu_iv_dist_abs(self): + """Test abydos.distance.BaulieuIV.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), -1066.2460472130606) + self.assertEqual(self.cmp.dist_abs('a', ''), -1060.8121333300487) + self.assertEqual(self.cmp.dist_abs('', 'a'), -1060.8121333300487) + self.assertEqual(self.cmp.dist_abs('abc', ''), -1055.3920882318764) + self.assertEqual(self.cmp.dist_abs('', 'abc'), -1055.3920882318764) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), -9498.574712454234) + self.assertEqual( + self.cmp.dist_abs('abcd', 'efgh'), -1039.2151656463932 + ) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), -7293.3912640224 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), -7293.3912640224 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), -7293.3912640224 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), -7293.3912640224 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), -15427.7573462754 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_ix.py b/tests/distance/test_distance_baulieu_ix.py new file mode 100644 index 000000000..fde60ef14 --- /dev/null +++ b/tests/distance/test_distance_baulieu_ix.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_ix. + +This module contains unit tests for abydos.distance.BaulieuIX +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuIX + + +class BaulieuIXTestCases(unittest.TestCase): + """Test BaulieuIX functions. 
+ + abydos.distance.BaulieuIX + """ + + cmp = BaulieuIX() + cmp_no_d = BaulieuIX(alphabet=0) + + def test_baulieu_ix_dist(self): + """Test abydos.distance.BaulieuIX.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.002551020408163265) + self.assertEqual(self.cmp.dist('', 'a'), 0.005089058524173028) + self.assertEqual(self.cmp.dist('abc', ''), 0.00510204081632653) + self.assertEqual(self.cmp.dist('', 'abc'), 0.01015228426395939) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.019011406844106463) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0114358323) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0127064803 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5882352941 + ) + + def test_baulieu_ix_sim(self): + """Test abydos.distance.BaulieuIX.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9949109414758269) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9809885931558935) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9885641677) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9885641677) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9885641677) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9885641677) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9872935197 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4117647059 + ) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/distance/test_distance_baulieu_v.py b/tests/distance/test_distance_baulieu_v.py new file mode 100644 index 000000000..d28243d4f --- /dev/null +++ b/tests/distance/test_distance_baulieu_v.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_v. + +This module contains unit tests for abydos.distance.BaulieuV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuV + + +class BaulieuVTestCases(unittest.TestCase): + """Test BaulieuV functions. + + abydos.distance.BaulieuV + """ + + cmp = BaulieuV() + cmp_no_d = BaulieuV(alphabet=0) + + def test_baulieu_v_dist(self): + """Test abydos.distance.BaulieuV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.2) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5333333333 + ) + + def test_baulieu_v_sim(self): + """Test abydos.distance.BaulieuV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4666666667 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_vi.py b/tests/distance/test_distance_baulieu_vi.py new file mode 100644 index 000000000..fb570acfd --- /dev/null +++ b/tests/distance/test_distance_baulieu_vi.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_vi. + +This module contains unit tests for abydos.distance.BaulieuVI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuVI + + +class BaulieuVITestCases(unittest.TestCase): + """Test BaulieuVI functions. + + abydos.distance.BaulieuVI + """ + + cmp = BaulieuVI() + cmp_no_d = BaulieuVI(alphabet=0) + + def test_baulieu_vi_dist(self): + """Test abydos.distance.BaulieuVI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.6666666666666666) + self.assertEqual(self.cmp.dist('', 'a'), 0.6666666666666666) + self.assertEqual(self.cmp.dist('abc', ''), 0.8) + self.assertEqual(self.cmp.dist('', 'abc'), 0.8) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9090909090909091) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4666666667 + ) + + def test_baulieu_vi_sim(self): + """Test abydos.distance.BaulieuVI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.33333333333333337) + self.assertEqual(self.cmp.sim('', 'a'), 0.33333333333333337) + self.assertEqual(self.cmp.sim('abc', ''), 0.19999999999999996) + self.assertEqual(self.cmp.sim('', 'abc'), 0.19999999999999996) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.09090909090909094) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_vii.py b/tests/distance/test_distance_baulieu_vii.py new file mode 100644 index 000000000..ea93de544 --- /dev/null +++ b/tests/distance/test_distance_baulieu_vii.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_vii. + +This module contains unit tests for abydos.distance.BaulieuVII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuVII + + +class BaulieuVIITestCases(unittest.TestCase): + """Test BaulieuVII functions. + + abydos.distance.BaulieuVII + """ + + cmp = BaulieuVII() + cmp_no_d = BaulieuVII(alphabet=0) + + def test_baulieu_vii_dist(self): + """Test abydos.distance.BaulieuVII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.002551020408163265) + self.assertEqual(self.cmp.dist('', 'a'), 0.002551020408163265) + self.assertEqual(self.cmp.dist('abc', ''), 0.00510204081632653) + self.assertEqual(self.cmp.dist('', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816327) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076238882) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076238882) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076238882) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076238882) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0082644628 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.0909090909 + ) + + def test_baulieu_vii_sim(self): + """Test abydos.distance.BaulieuVII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923761118) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923761118) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923761118) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923761118) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9917355372 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + 
self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.9090909091 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_viii.py b/tests/distance/test_distance_baulieu_viii.py new file mode 100644 index 000000000..69043c435 --- /dev/null +++ b/tests/distance/test_distance_baulieu_viii.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_viii. + +This module contains unit tests for abydos.distance.BaulieuVIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuVIII + + +class BaulieuVIIITestCases(unittest.TestCase): + """Test BaulieuVIII functions. 
+ + abydos.distance.BaulieuVIII + """ + + cmp = BaulieuVIII() + cmp_no_d = BaulieuVIII(alphabet=0) + + def test_baulieu_viii_dist(self): + """Test abydos.distance.BaulieuVIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 6.507705122865473e-06) + self.assertEqual(self.cmp.dist('', 'a'), 6.507705122865473e-06) + self.assertEqual(self.cmp.dist('abc', ''), 2.6030820491461892e-05) + self.assertEqual(self.cmp.dist('', 'abc'), 2.6030820491461892e-05) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 1.6269e-06 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.0051020408 + ) + + def test_baulieu_viii_sim(self): + """Test abydos.distance.BaulieuVIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9999934922948771) + self.assertEqual(self.cmp.sim('', 'a'), 0.9999934922948771) + self.assertEqual(self.cmp.sim('abc', ''), 0.9999739691795085) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9999739691795085) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9999983731 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.9948979592 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_x.py b/tests/distance/test_distance_baulieu_x.py new file mode 100644 index 
000000000..fa16b7d3c --- /dev/null +++ b/tests/distance/test_distance_baulieu_x.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_x. + +This module contains unit tests for abydos.distance.BaulieuX +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuX + + +class BaulieuXTestCases(unittest.TestCase): + """Test BaulieuX functions. + + abydos.distance.BaulieuX + """ + + cmp = BaulieuX() + cmp_no_d = BaulieuX(alphabet=0) + + def test_baulieu_x_dist(self): + """Test abydos.distance.BaulieuX.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.005089058524173028) + self.assertEqual(self.cmp.dist('', 'a'), 0.005089058524173028) + self.assertEqual(self.cmp.dist('abc', ''), 0.01015228426395939) + self.assertEqual(self.cmp.dist('', 'abc'), 0.01015228426395939) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.019011406844106463) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0114358323) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0114358323) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0139593909 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6111111111 + ) + + def test_baulieu_x_sim(self): + """Test abydos.distance.BaulieuX.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9949109414758269) + self.assertEqual(self.cmp.sim('', 'a'), 0.9949109414758269) + self.assertEqual(self.cmp.sim('abc', ''), 0.9898477157360406) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9809885931558935) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9885641677) + 
self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9885641677) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9885641677) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9885641677) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9860406091 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3888888889 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_xi.py b/tests/distance/test_distance_baulieu_xi.py new file mode 100644 index 000000000..e6380e4ad --- /dev/null +++ b/tests/distance/test_distance_baulieu_xi.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_xi. + +This module contains unit tests for abydos.distance.BaulieuXI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuXI +from abydos.tokenizer import QSkipgrams + + +class BaulieuXITestCases(unittest.TestCase): + """Test BaulieuXI functions. 
+ + abydos.distance.BaulieuXI + """ + + cmp = BaulieuXI() + cmp_no_d = BaulieuXI(alphabet=0) + + def test_baulieu_xi_dist(self): + """Test abydos.distance.BaulieuXI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.002551020408163265) + self.assertEqual(self.cmp.dist('', 'a'), 0.002551020408163265) + self.assertEqual(self.cmp.dist('abc', ''), 0.00510204081632653) + self.assertEqual(self.cmp.dist('', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816327) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076824584) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076824584) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076824584) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076824584) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.009009009 + ) + + self.assertEqual( + BaulieuXI( + alphabet=None, tokenizer=QSkipgrams(qval=2, scaler='SSK') + ).dist('a', 'eh'), + 0.0, + ) + + def test_baulieu_xi_sim(self): + """Test abydos.distance.BaulieuXI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923175416) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923175416) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923175416) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923175416) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.990990991 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_xii.py b/tests/distance/test_distance_baulieu_xii.py new file mode 100644 index 000000000..241612cf1 --- /dev/null +++ b/tests/distance/test_distance_baulieu_xii.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_xii. + +This module contains unit tests for abydos.distance.BaulieuXII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuXII + + +class BaulieuXIITestCases(unittest.TestCase): + """Test BaulieuXII functions. 
+ + abydos.distance.BaulieuXII + """ + + cmp = BaulieuXII() + cmp_no_d = BaulieuXII(alphabet=0) + + def test_baulieu_xii_dist(self): + """Test abydos.distance.BaulieuXII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5384615385 + ) + + def test_baulieu_xii_sim(self): + """Test abydos.distance.BaulieuXII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4615384615 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_xiii.py b/tests/distance/test_distance_baulieu_xiii.py new file mode 100644 index 000000000..093c57538 --- /dev/null +++ b/tests/distance/test_distance_baulieu_xiii.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_xiii. + +This module contains unit tests for abydos.distance.BaulieuXIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuXIII + + +class BaulieuXIIITestCases(unittest.TestCase): + """Test BaulieuXIII functions. 
+ + abydos.distance.BaulieuXIII + """ + + cmp = BaulieuXIII() + cmp_no_d = BaulieuXIII(alphabet=0) + + def test_baulieu_xiii_dist(self): + """Test abydos.distance.BaulieuXIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0909090909 + ) + + def test_baulieu_xiii_sim(self): + """Test abydos.distance.BaulieuXIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9090909091 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_xiv.py b/tests/distance/test_distance_baulieu_xiv.py new file mode 100644 index 000000000..8383ebc5a --- /dev/null +++ b/tests/distance/test_distance_baulieu_xiv.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_xiv. + +This module contains unit tests for abydos.distance.BaulieuXIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuXIV + + +class BaulieuXIVTestCases(unittest.TestCase): + """Test BaulieuXIV functions. 
+ + abydos.distance.BaulieuXIV + """ + + cmp = BaulieuXIV() + cmp_no_d = BaulieuXIV(alphabet=0) + + def test_baulieu_xiv_dist(self): + """Test abydos.distance.BaulieuXIV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5882352941 + ) + + def test_baulieu_xiv_sim(self): + """Test abydos.distance.BaulieuXIV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4117647059 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baulieu_xv.py b/tests/distance/test_distance_baulieu_xv.py new file mode 100644 index 000000000..5ae0c62e8 --- /dev/null +++ b/tests/distance/test_distance_baulieu_xv.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_baulieu_xv. + +This module contains unit tests for abydos.distance.BaulieuXV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BaulieuXV + + +class BaulieuXVTestCases(unittest.TestCase): + """Test BaulieuXV functions. 
+ + abydos.distance.BaulieuXV + """ + + cmp = BaulieuXV() + cmp_no_d = BaulieuXV(alphabet=0) + + def test_baulieu_xv_dist(self): + """Test abydos.distance.BaulieuXV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6111111111 + ) + + def test_baulieu_xv_sim(self): + """Test abydos.distance.BaulieuXV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3888888889 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_baystat.py b/tests/distance/test_distance_baystat.py index f0e75287b..3cc4d4457 100644 --- a/tests/distance/test_distance_baystat.py +++ b/tests/distance/test_distance_baystat.py @@ -68,7 +68,7 @@ def test_baystat_sim(self): # Tests to maximize coverage self.assertAlmostEqual( - self.cmp.sim('ZIMMERMANN', 'SEMMERMANN', 2, 2, 2), 0.8 + Baystat(2, 2, 2).sim('ZIMMERMANN', 'SEMMERMANN'), 0.8 ) self.assertAlmostEqual(self.cmp.sim('ZIMMER', 'ZIMMERMANN'), 0.6) diff --git a/tests/distance/test_distance_benini_i.py b/tests/distance/test_distance_benini_i.py new file mode 100644 index 000000000..fddaad897 --- /dev/null +++ b/tests/distance/test_distance_benini_i.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_benini_i. + +This module contains unit tests for abydos.distance.BeniniI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BeniniI + + +class BeniniITestCases(unittest.TestCase): + """Test BeniniI functions. 
+ + abydos.distance.BeniniI + """ + + cmp = BeniniI() + cmp_no_d = BeniniI(alphabet=0) + + def test_benini_i_sim(self): + """Test abydos.distance.BeniniI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8478654592 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3 + ) + + def test_benini_i_dist(self): + """Test abydos.distance.BeniniI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1521345408 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7 + ) + + def test_benini_i_corr(self): + """Test abydos.distance.BeniniI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6957309185 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.4 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_benini_ii.py b/tests/distance/test_distance_benini_ii.py new file mode 100644 index 000000000..636932e6c --- /dev/null +++ b/tests/distance/test_distance_benini_ii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_benini_ii. + +This module contains unit tests for abydos.distance.BeniniII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BeniniII + + +class BeniniIITestCases(unittest.TestCase): + """Test BeniniII functions. 
+ + abydos.distance.BeniniII + """ + + cmp = BeniniII() + cmp_no_d = BeniniII(alphabet=0) + + def test_benini_ii_sim(self): + """Test abydos.distance.BeniniII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8478654592 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3 + ) + + def test_benini_ii_dist(self): + """Test abydos.distance.BeniniII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1521345408 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7 + ) + + def test_benini_ii_corr(self): + """Test abydos.distance.BeniniII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6957309185 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.4 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_bennet.py b/tests/distance/test_distance_bennet.py new file mode 100644 index 000000000..132e8ac5e --- /dev/null +++ b/tests/distance/test_distance_bennet.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_bennet. + +This module contains unit tests for abydos.distance.Bennet +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Bennet + + +class BennetTestCases(unittest.TestCase): + """Test Bennet functions. 
+ + abydos.distance.Bennet + """ + + cmp = Bennet() + cmp_no_d = Bennet(alphabet=0) + + def test_bennet_sim(self): + """Test abydos.distance.Bennet.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9910714286 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_bennet_dist(self): + """Test abydos.distance.Bennet.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('', 'a'), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('abc', ''), 0.005102040816326481) + self.assertEqual(self.cmp.dist('', 'abc'), 0.005102040816326481) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816313) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def 
test_bennet_corr(self): + """Test abydos.distance.Bennet.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.9948979591836735) + self.assertEqual(self.cmp.corr('', 'a'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abc', ''), 0.989795918367347) + self.assertEqual(self.cmp.corr('', 'abc'), 0.989795918367347) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.9744897959183674) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9846938776) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9821428571 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_bhattacharyya.py b/tests/distance/test_distance_bhattacharyya.py new file mode 100644 index 000000000..b2245a70d --- /dev/null +++ b/tests/distance/test_distance_bhattacharyya.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_bhattacharyya. + +This module contains unit tests for abydos.distance.Bhattacharyya +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Bhattacharyya + + +class BhattacharyyaTestCases(unittest.TestCase): + """Test Bhattacharyya functions. 
+ + abydos.distance.Bhattacharyya + """ + + cmp = Bhattacharyya() + + def test_bhattacharyya_dist(self): + """Test abydos.distance.Bhattacharyya.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6674238125 + ) + + def test_bhattacharyya_sim(self): + """Test abydos.distance.Bhattacharyya.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3325761875 + ) + + def test_bhattacharyya_dist_abs(self): + """Test abydos.distance.Bhattacharyya.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('a', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('', 'a'), float('-inf')) + self.assertEqual(self.cmp.dist_abs('abc', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('', 'abc'), float('-inf')) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), float('-inf')) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 0.6931471806 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 0.6931471806 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.6931471806 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 0.6931471806 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.4043300338 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_bisim.py b/tests/distance/test_distance_bisim.py new file mode 100644 index 000000000..43df97b61 --- /dev/null +++ b/tests/distance/test_distance_bisim.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_bisim. + +This module contains unit tests for abydos.distance.BISIM +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BISIM + + +class BISIMTestCases(unittest.TestCase): + """Test BISIM functions. + + abydos.distance.BISIM + """ + + cmp = BISIM() + cmp3 = BISIM(qval=3) + + def test_bi_sim_sim(self): + """Test abydos.distance.BISIM.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6) + + # test cases from Kondrak and Dorr (2003) + self.assertAlmostEqual(self.cmp.sim('ara', 'ala'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('atara', 'arata'), 0.6) + self.assertAlmostEqual(self.cmp.sim('amaryl', 'amikin'), 0.4166666667) + self.assertAlmostEqual(self.cmp.sim('amaryl', 'altoce'), 0.250) + + # other examples from Kondrak and Dorr (2004) + self.assertAlmostEqual(self.cmp.sim('Zantac', 'Xanax'), 0.4166666667) + self.assertAlmostEqual(self.cmp.sim('Zantac', 'Contac'), 0.5833333333) + self.assertAlmostEqual(self.cmp.sim('Xanax', 'Contac'), 0.25) + self.assertAlmostEqual(self.cmp3.sim('Zantac', 'Xanax'), 0.333333333) + self.assertAlmostEqual(self.cmp3.sim('Zantac', 'Contac'), 0.5) + self.assertAlmostEqual(self.cmp3.sim('Xanax', 'Contac'), 0.166666667) + + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Tramadol'), 0.6875) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Tobradex'), 0.6250) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Torecan'), 0.57142857) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Stadol'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Torsemide'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Theraflu'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Tegretol'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Toradol', 'Taxol'), 0.5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_bleu.py b/tests/distance/test_distance_bleu.py new file mode 100644 index 000000000..c8fd9f781 --- /dev/null +++ b/tests/distance/test_distance_bleu.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. 
If not, see . + +"""abydos.tests.distance.test_distance_bleu. + +This module contains unit tests for abydos.distance.BLEU +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BLEU +from abydos.tokenizer import QSkipgrams, SAPSTokenizer + + +class BLEUTestCases(unittest.TestCase): + """Test BLEU functions. + + abydos.distance.BLEU + """ + + cmp = BLEU() + cmp_skip_saps = BLEU( + tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67] + ) + + def test_bleu_sim(self): + """Test abydos.distance.BLEU.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6223329773) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6223329773) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7071067812) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7071067812) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5119598032 + ) + + self.assertAlmostEqual( + self.cmp_skip_saps.sim('Nigel', 'Niall'), 0.7828303104 + ) + + def test_bleu_dist(self): + """Test abydos.distance.BLEU.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3776670227) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3776670227) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2928932188) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2928932188) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4880401968 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_block_levenshtein.py b/tests/distance/test_distance_block_levenshtein.py new file mode 100644 index 000000000..7245a6045 --- /dev/null +++ b/tests/distance/test_distance_block_levenshtein.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_block_levenshtein. 
+ +This module contains unit tests for abydos.distance.BlockLevenshtein +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BlockLevenshtein + + +class BlockLevenshteinTestCases(unittest.TestCase): + """Test BlockLevenshtein functions. + + abydos.distance.BlockLevenshtein + """ + + cmp = BlockLevenshtein() + + def test_block_levenshtein_dist(self): + """Test abydos.distance.BlockLevenshtein.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4) + + def test_block_levenshtein_sim(self): + """Test abydos.distance.BlockLevenshtein.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6) + + def test_block_levenshtein_dist_abs(self): + """Test abydos.distance.BlockLevenshtein.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_brainerd_robinson.py b/tests/distance/test_distance_brainerd_robinson.py new file mode 100644 index 000000000..b3f33fbc0 --- /dev/null +++ b/tests/distance/test_distance_brainerd_robinson.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_brainerd_robinson. + +This module contains unit tests for abydos.distance.BrainerdRobinson +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BrainerdRobinson + + +class BrainerdRobinsonTestCases(unittest.TestCase): + """Test BrainerdRobinson functions. + + abydos.distance.BrainerdRobinson + """ + + cmp = BrainerdRobinson() + + def test_brainerd_robinson_sim(self): + """Test abydos.distance.BrainerdRobinson.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + def test_brainerd_robinson_dist(self): + """Test abydos.distance.BrainerdRobinson.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + def test_brainerd_robinson_sim_score(self): + """Test abydos.distance.BrainerdRobinson.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 200.0) + self.assertEqual(self.cmp.sim_score('a', ''), 100.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 100.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 100.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 100.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 200.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 100.0) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 100.0) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 100.0) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 100.0) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 127.2727272727 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_braun_blanquet.py b/tests/distance/test_distance_braun_blanquet.py new file mode 100644 index 000000000..21ce02c7f --- /dev/null +++ 
b/tests/distance/test_distance_braun_blanquet.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see <http://www.gnu.org/licenses/>. + +"""abydos.tests.distance.test_distance_braun_blanquet. + +This module contains unit tests for abydos.distance.BraunBlanquet +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import BraunBlanquet + + +class BraunBlanquetTestCases(unittest.TestCase): + """Test BraunBlanquet functions. + + abydos.distance.BraunBlanquet + """ + + cmp = BraunBlanquet() + cmp_no_d = BraunBlanquet(alphabet=0) + + def test_braun_blanquet_sim(self): + """Test abydos.distance.BraunBlanquet.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + def test_braun_blanquet_dist(self): + """Test abydos.distance.BraunBlanquet.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_canberra.py b/tests/distance/test_distance_canberra.py new file mode 100644 index 000000000..2397c4b1a --- /dev/null +++ b/tests/distance/test_distance_canberra.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see <http://www.gnu.org/licenses/>. + +"""abydos.tests.distance.test_distance_canberra. + +This module contains unit tests for abydos.distance.Canberra +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Canberra + + +class CanberraTestCases(unittest.TestCase): + """Test Canberra functions. + + abydos.distance.Canberra + """ + + cmp = Canberra() + + def test_canberra_dist(self): + """Test abydos.distance.Canberra.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_canberra_sim(self): + """Test abydos.distance.Canberra.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_chebyshev.py b/tests/distance/test_distance_chebyshev.py index 362dff71f..b1bb278ce 100644 --- a/tests/distance/test_distance_chebyshev.py +++ b/tests/distance/test_distance_chebyshev.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Chebyshev, chebyshev -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class ChebyshevTestCases(unittest.TestCase): """ cmp = Chebyshev() + cmp_q2 = Chebyshev(tokenizer=QGrams(2)) + cmp_ws = Chebyshev(tokenizer=WhitespaceTokenizer()) def test_chebyshev_dist_abs(self): """Test abydos.distance.Chebyshev.dist_abs.""" @@ -51,25 +53,47 @@ def test_chebyshev_dist_abs(self): self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 1) self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 1) - self.assertEqual(self.cmp.dist_abs('', '', 2), 0) - self.assertEqual(self.cmp.dist_abs('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist_abs('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen', 2), 1) + self.assertEqual(self.cmp_q2.dist_abs('', ''), 0) + self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 1) # supplied q-gram tests - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist_abs(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist_abs(QGrams('nelson'), QGrams('neilsen')), 1 + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, ) # non-q-gram tests - self.assertEqual(self.cmp.dist_abs('', '', 0), 0) - self.assertEqual(self.cmp.dist_abs('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist_abs('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_FROM, NONQ_TO, 0), 1) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_TO, NONQ_FROM, 0), 1) + self.assertEqual(self.cmp_ws.dist_abs('', ''), 0) + self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 1) + self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 1) # Test wrapper self.assertAlmostEqual(chebyshev('nelson', 'neilsen', 2), 1) diff --git a/tests/distance/test_distance_chord.py b/tests/distance/test_distance_chord.py new file mode 100644 index 000000000..77a4c32c2 --- /dev/null +++ b/tests/distance/test_distance_chord.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_chord. 
+ +This module contains unit tests for abydos.distance.Chord +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Chord + + +class ChordTestCases(unittest.TestCase): + """Test Chord functions. + + abydos.distance.Chord + """ + + cmp = Chord() + + def test_chord_dist(self): + """Test abydos.distance.Chord.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.707106781186547) + self.assertEqual(self.cmp.dist('', 'a'), 0.707106781186547) + self.assertEqual(self.cmp.dist('abc', ''), 0.707106781186547) + self.assertEqual(self.cmp.dist('', 'abc'), 0.707106781186547) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7071067812) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5766941889 + ) + + def test_chord_sim(self): + """Test abydos.distance.Chord.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.292893218813453) + self.assertEqual(self.cmp.sim('', 'a'), 0.292893218813453) + self.assertEqual(self.cmp.sim('abc', ''), 0.292893218813453) + self.assertEqual(self.cmp.sim('', 'abc'), 0.292893218813453) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2928932188) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4233058111 + ) + + def test_chord_dist_abs(self): + """Test abydos.distance.Chord.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 1.414213562373095) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.8155687433 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_clement.py b/tests/distance/test_distance_clement.py new file mode 100644 index 000000000..4f5437201 --- /dev/null +++ b/tests/distance/test_distance_clement.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_clement. + +This module contains unit tests for abydos.distance.Clement +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Clement + + +class ClementTestCases(unittest.TestCase): + """Test Clement functions. + + abydos.distance.Clement + """ + + cmp = Clement() + cmp_no_d = Clement(alphabet=0) + + def test_clement_sim(self): + """Test abydos.distance.Clement.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0025510204081632404) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.005102040816326481) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.006336616803332366) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5037970201) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5037970201) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5037970201) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5037970201) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6414112246 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1363636364 + ) + + def test_clement_dist(self): + """Test abydos.distance.Clement.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9936633831966676) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4962029799) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4962029799) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4962029799) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4962029799) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3585887754 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) 
+ self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_cohen_kappa.py b/tests/distance/test_distance_cohen_kappa.py new file mode 100644 index 000000000..787a3047e --- /dev/null +++ b/tests/distance/test_distance_cohen_kappa.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_cohen_kappa. + +This module contains unit tests for abydos.distance.CohenKappa +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import CohenKappa + + +class CohenKappaTestCases(unittest.TestCase): + """Test CohenKappa functions. 
+ + abydos.distance.CohenKappa + """ + + cmp = CohenKappa() + cmp_no_d = CohenKappa(alphabet=0) + + def test_cohen_kappa_sim(self): + """Test abydos.distance.CohenKappa.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918263) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918263) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429667) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429667) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9954751131 + ) + + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + def test_cohen_kappa_dist(self): + """Test abydos.distance.CohenKappa.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033292) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033292) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0045248869 + ) + + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_cole.py b/tests/distance/test_distance_cole.py new file mode 100644 index 000000000..33582d770 --- /dev/null +++ b/tests/distance/test_distance_cole.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_cole. + +This module contains unit tests for abydos.distance.Cole +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Cole + + +class ColeTestCases(unittest.TestCase): + """Test Cole functions. 
+ + abydos.distance.Cole + """ + + cmp = Cole() + cmp_no_d = Cole(alphabet=0) + cmp_a16 = Cole(alphabet=16) + + def test_cole_sim(self): + """Test abydos.distance.Cole.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8158327461 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + # cases b & c + self.assertAlmostEqual( + self.cmp_a16.sim('ATCAACGAGT', 'AACGATTAG'), 0.5151515151515151 + ) + self.assertAlmostEqual( + self.cmp_a16.sim('ATCAACGAGT', 'AACGAACGATTAGATTAG'), + 0.2976190476190476, + ) + + def test_cole_dist(self): + """Test abydos.distance.Cole.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1841672539 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_cole_corr(self): + """Test 
abydos.distance.Cole.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6316654921 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_complete_linkage.py b/tests/distance/test_distance_complete_linkage.py new file mode 100644 index 000000000..07c578917 --- /dev/null +++ b/tests/distance/test_distance_complete_linkage.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_complete_linkage. + +This module contains unit tests for abydos.distance.CompleteLinkage +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import CompleteLinkage, JaroWinkler +from abydos.tokenizer import QGrams + + +class CompleteLinkageTestCases(unittest.TestCase): + """Test CompleteLinkage functions. 
+ + abydos.distance.CompleteLinkage + """ + + cmp = CompleteLinkage() + cmp_q4 = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop='')) + cmp_q4_jw = CompleteLinkage( + tokenizer=QGrams(qval=4, start_stop=''), metric=JaroWinkler() + ) + + def test_complete_linkage_dist(self): + """Test abydos.distance.CompleteLinkage.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 1.0) + + self.assertEqual(self.cmp_q4.dist('AAAT', 'AATT'), 0.25) + self.assertAlmostEqual( + self.cmp_q4_jw.dist('AAAT', 'AATT'), 0.133333333333 + ) + + def test_complete_linkage_sim(self): + """Test abydos.distance.CompleteLinkage.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0) + + def test_complete_linkage_dist_abs(self): + """Test abydos.distance.CompleteLinkage.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('a', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('', 'a'), float('-inf')) + self.assertEqual(self.cmp.dist_abs('abc', ''), float('-inf')) + self.assertEqual(self.cmp.dist_abs('', 'abc'), float('-inf')) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 2) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 2) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_consonni_todeschini_i.py b/tests/distance/test_distance_consonni_todeschini_i.py new file mode 100644 index 000000000..f416fb681 --- /dev/null +++ b/tests/distance/test_distance_consonni_todeschini_i.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_consonni_todeschini_i. + +This module contains unit tests for abydos.distance.ConsonniTodeschiniI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ConsonniTodeschiniI + + +class ConsonniTodeschiniITestCases(unittest.TestCase): + """Test ConsonniTodeschiniI functions. + + abydos.distance.ConsonniTodeschiniI + """ + + cmp = ConsonniTodeschiniI() + cmp_no_d = ConsonniTodeschiniI(alphabet=0) + + def test_consonni_todeschini_i_sim(self): + """Test abydos.distance.ConsonniTodeschiniI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9996172903036489) + self.assertEqual(self.cmp.sim('', 'a'), 0.9996172903036489) + self.assertEqual(self.cmp.sim('abc', ''), 0.9992336018090547) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9992336018090547) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9980766131469967) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9988489295) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9988489295) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9988489295) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9988489295) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9986562228 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.7678740744 + ) + + def test_consonni_todeschini_i_dist(self): + """Test abydos.distance.ConsonniTodeschiniI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0003827096963511245) + self.assertEqual(self.cmp.dist('', 'a'), 0.0003827096963511245) + self.assertEqual(self.cmp.dist('abc', ''), 0.0007663981909452611) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0007663981909452611) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.001923386853003306) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0011510705) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0011510705) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0011510705) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0011510705) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 
0.0013437772 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.2321259256 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_consonni_todeschini_ii.py b/tests/distance/test_distance_consonni_todeschini_ii.py new file mode 100644 index 000000000..587fbaf8d --- /dev/null +++ b/tests/distance/test_distance_consonni_todeschini_ii.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_consonni_todeschini_ii. + +This module contains unit tests for abydos.distance.ConsonniTodeschiniII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ConsonniTodeschiniII + + +class ConsonniTodeschiniIITestCases(unittest.TestCase): + """Test ConsonniTodeschiniII functions. 
+ + abydos.distance.ConsonniTodeschiniII + """ + + cmp = ConsonniTodeschiniII() + cmp_no_d = ConsonniTodeschiniII(alphabet=0) + + def test_consonni_todeschini_ii_sim(self): + """Test abydos.distance.ConsonniTodeschiniII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.8351838558230296) + self.assertEqual(self.cmp.sim('', 'a'), 0.8351838558230296) + self.assertEqual(self.cmp.sim('abc', ''), 0.7585487129939101) + self.assertEqual(self.cmp.sim('', 'abc'), 0.7585487129939101) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.640262668568961) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7080704349) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7080704349) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7080704349) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7080704349) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6880377723 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.15490196) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.15490196) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.15490196) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.15490196) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2321259256 + ) + + def test_consonni_todeschini_ii_dist(self): + """Test abydos.distance.ConsonniTodeschiniII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.16481614417697044) + self.assertEqual(self.cmp.dist('', 'a'), 0.16481614417697044) + self.assertEqual(self.cmp.dist('abc', ''), 0.24145128700608987) + self.assertEqual(self.cmp.dist('', 'abc'), 0.24145128700608987) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.35973733143103903) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2919295651) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2919295651) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2919295651) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2919295651) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3119622277 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.84509804 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.84509804 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.84509804 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.84509804 + ) + self.assertAlmostEqual( + 
self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7678740744 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_consonni_todeschini_iii.py b/tests/distance/test_distance_consonni_todeschini_iii.py new file mode 100644 index 000000000..3223d4ea6 --- /dev/null +++ b/tests/distance/test_distance_consonni_todeschini_iii.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_consonni_todeschini_iii. + +This module contains unit tests for abydos.distance.ConsonniTodeschiniIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ConsonniTodeschiniIII + + +class ConsonniTodeschiniIIITestCases(unittest.TestCase): + """Test ConsonniTodeschiniIII functions. + + abydos.distance.ConsonniTodeschiniIII + """ + + cmp = ConsonniTodeschiniIII() + cmp_no_d = ConsonniTodeschiniIII(alphabet=0) + + def test_consonni_todeschini_iii_sim(self): + """Test abydos.distance.ConsonniTodeschiniIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.24145128700608987) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2079748185) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2079748185) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2079748185) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2079748185) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3119622277 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.6020599913 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.7678740744 + ) + + def test_consonni_todeschini_iii_dist(self): + """Test abydos.distance.ConsonniTodeschiniIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + 
self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.7585487129939101) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7920251815) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7920251815) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7920251815) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7920251815) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6880377723 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.3979400087 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.2321259256 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_consonni_todeschini_iv.py b/tests/distance/test_distance_consonni_todeschini_iv.py new file mode 100644 index 000000000..618a5c638 --- /dev/null +++ b/tests/distance/test_distance_consonni_todeschini_iv.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_consonni_todeschini_iv. + +This module contains unit tests for abydos.distance.ConsonniTodeschiniIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ConsonniTodeschiniIV + + +class ConsonniTodeschiniIVTestCases(unittest.TestCase): + """Test ConsonniTodeschiniIV functions. 
+ + abydos.distance.ConsonniTodeschiniIV + """ + + cmp = ConsonniTodeschiniIV() + cmp_no_d = ConsonniTodeschiniIV(alphabet=0) + + def test_consonni_todeschini_iv_sim(self): + """Test abydos.distance.ConsonniTodeschiniIV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6020599913) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6020599913) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6020599913) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6020599913) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7678740744 + ) + + def test_consonni_todeschini_iv_dist(self): + """Test abydos.distance.ConsonniTodeschiniIV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3979400087) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3979400087) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3979400087) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3979400087) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2321259256 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_consonni_todeschini_v.py b/tests/distance/test_distance_consonni_todeschini_v.py new file mode 100644 index 000000000..85b52d1c0 --- /dev/null +++ b/tests/distance/test_distance_consonni_todeschini_v.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_consonni_todeschini_v. + +This module contains unit tests for abydos.distance.ConsonniTodeschiniV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ConsonniTodeschiniV + + +class ConsonniTodeschiniVTestCases(unittest.TestCase): + """Test ConsonniTodeschiniV functions. 
+ + abydos.distance.ConsonniTodeschiniV + """ + + cmp = ConsonniTodeschiniV() + cmp_no_d = ConsonniTodeschiniV(alphabet=0) + + def test_consonni_todeschini_v_sim(self): + """Test abydos.distance.ConsonniTodeschiniV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8368594684755613) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.36359270242851005) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7281476435) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7281476435) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7281476435) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7281476435) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7523559381 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1233121373 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1233121373 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1233121373 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1233121373 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1721709773 + ) + + def test_consonni_todeschini_v_dist(self): + """Test abydos.distance.ConsonniTodeschiniV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.16314053152443875) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.63640729757149) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2718523565) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2718523565) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2718523565) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2718523565) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2476440619 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8766878627 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8766878627 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8766878627 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8766878627 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8278290227 + ) 
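+
+    # The base cases above are consistent with sim == (1 + corr) / 2 and
+    # dist == 1 - sim; the corr tests below exercise the same quantity
+    # before it is rescaled onto [0, 1].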
+ + def test_consonni_todeschini_v_corr(self): + """Test abydos.distance.ConsonniTodeschiniV.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.6737189369511224) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.2728145951429799) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4562952871) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4562952871) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4562952871) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4562952871) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.5047118763 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.7533757254 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.7533757254 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.7533757254 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.7533757254 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.6556580454 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_cormode_lz.py b/tests/distance/test_distance_cormode_lz.py new file mode 100644 index 000000000..cb9615fc5 --- /dev/null +++ b/tests/distance/test_distance_cormode_lz.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_cormode_lz. + +This module contains unit tests for abydos.distance.CormodeLZ +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import CormodeLZ + + +class CormodeLZTestCases(unittest.TestCase): + """Test CormodeLZ functions. 
+ + abydos.distance.CormodeLZ + """ + + cmp = CormodeLZ() + + def test_cormode_lz_dist(self): + """Test abydos.distance.CormodeLZ.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4) + + def test_cormode_lz_sim(self): + """Test abydos.distance.CormodeLZ.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6) + + def test_cormode_lz_dist_abs(self): + """Test abydos.distance.CormodeLZ.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 1) + self.assertEqual(self.cmp.dist_abs('a', ''), 2) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 4) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 1) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 1) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 5) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_cosine.py b/tests/distance/test_distance_cosine.py index d94e32e2e..6f87d7433 100644 --- a/tests/distance/test_distance_cosine.py +++ b/tests/distance/test_distance_cosine.py @@ -32,7 +32,7 @@ import unittest from abydos.distance import Cosine, dist_cosine, sim_cosine -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -44,6 +44,8 @@ class CosineSimilarityTestCases(unittest.TestCase): """ cmp = Cosine() + cmp_q2 = Cosine(tokenizer=QGrams(2)) + cmp_ws = Cosine(tokenizer=WhitespaceTokenizer()) def test_cosine_sim(self): """Test abydos.distance.Cosine.sim.""" @@ -54,33 +56,56 @@ def test_cosine_sim(self): self.cmp.sim('nelson', 'neilsen'), 4 / math.sqrt(7 * 8) ) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) self.assertAlmostEqual( - self.cmp.sim('nelson', 'neilsen', 2), 4 / math.sqrt(7 * 8) + self.cmp_q2.sim('nelson', 'neilsen'), 4 / math.sqrt(7 * 8) ) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), 4 / math.sqrt(7 * 8), ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) self.assertAlmostEqual( - self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 4 / math.sqrt(9 * 7) + self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 4 / math.sqrt(9 * 7) ) self.assertAlmostEqual( - self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 4 / math.sqrt(9 * 7) + self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 4 / math.sqrt(9 * 7) ) + self.assertEqual(self.cmp_q2.sim('eh', 'a'), 0.0) + # Test wrapper self.assertAlmostEqual( sim_cosine('nelson', 'neilsen'), 4 / math.sqrt(7 * 8) @@ -95,31 +120,52 @@ def test_cosine_dist(self): self.cmp.dist('nelson', 'neilsen'), 1 - (4 / math.sqrt(7 * 8)) ) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) self.assertAlmostEqual( - self.cmp.dist('nelson', 'neilsen', 2), 1 - (4 / math.sqrt(7 * 8)) + self.cmp_q2.dist('nelson', 'neilsen'), 1 - (4 / math.sqrt(7 * 8)) ) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + 
QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), 1 - (4 / math.sqrt(7 * 8)), ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) self.assertAlmostEqual( - self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 1 - 4 / math.sqrt(9 * 7) + self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 - 4 / math.sqrt(9 * 7) ) self.assertAlmostEqual( - self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 1 - 4 / math.sqrt(9 * 7) + self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 - 4 / math.sqrt(9 * 7) ) # Test wrapper diff --git a/tests/distance/test_distance_covington.py b/tests/distance/test_distance_covington.py new file mode 100644 index 000000000..f3d29e122 --- /dev/null +++ b/tests/distance/test_distance_covington.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_covington. + +This module contains unit tests for abydos.distance.Covington +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Covington + +from six import PY2 + + +class CovingtonTestCases(unittest.TestCase): + """Test Covington functions. 
+ + abydos.distance.Covington + """ + + cmp = Covington() + + def test_covington_dist(self): + """Test abydos.distance.Covington.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.014705882352941176) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.4772727272727273) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2592592593) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2592592593) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2037037037) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2037037037) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3578947368 + ) + + self.assertEqual(self.cmp.dist('bcd', 'bcd'), 0.0) + + def test_covington_sim(self): + """Test abydos.distance.Covington.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9852941176470589) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5227272727272727) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7407407407) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7407407407) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7962962963) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7962962963) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6421052632 + ) + + self.assertEqual(self.cmp.sim('bcd', 'bcd'), 1.0) + + def test_covington_dist_abs(self): + """Test abydos.distance.Covington.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 50) + self.assertEqual(self.cmp.dist_abs('', 'a'), 50) + self.assertEqual(self.cmp.dist_abs('abc', ''), 130) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 130) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 5) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 210) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 140) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 140) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 110) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 110) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 340 + ) + + def test_covington_alignments(self): + """Test abydos.distance.Covington.alignments.""" + if PY2: # skip tests of alignments on Python 2.7 + return + + self.assertEqual( + repr(self.cmp.alignments('yo', 'ze', top_n=1)[0]), + "Alignment(src='yo', tar='ze', score=130)", + ) + self.assertEqual( + repr(self.cmp.alignments('tres', 'trwa', top_n=1)[0]), + "Alignment(src='tr-es', tar='trwa-', score=130)", + ) + self.assertEqual( + repr(self.cmp.alignments('detir', 'dir', top_n=1)[0]), + "Alignment(src='detir', tar='d--ir', score=95)", + ) + self.assertEqual( + repr(self.cmp.alignments('niy', 'kni', top_n=1)[0]), + "Alignment(src='-niy', tar='kni-', score=105)", + ) + self.assertEqual( + repr(self.cmp.alignments('hart', 'kordis', top_n=1)[0]), + "Alignment(src='hart--', tar='kordis', score=240)", + ) + self.assertEqual( + repr(self.cmp.alignments('niy', 
'genu', top_n=1)[0]), + "Alignment(src='--niy', tar='genu-', score=170)", + ) + self.assertEqual( + repr(self.cmp.alignments('namesa', 'namiqs', top_n=1)[0]), + "Alignment(src='name-sa', tar='namiqs-', score=135)", + ) + self.assertEqual( + repr(self.cmp.alignments('kentum', 'satem', top_n=1)[0]), + "Alignment(src='kentum', tar='sa-tem', score=170)", + ) + self.assertEqual( + repr(self.cmp.alignments('kentum', 'hekaton', top_n=1)[0]), + "Alignment(src='--kentum', tar='heka-ton', score=260)", + ) + self.assertEqual( + repr(self.cmp.alignments('doter', 'tugatir', top_n=2)[1]), + "Alignment(src='do--ter', tar='tugatir', score=210)", + ) + self.assertEqual( + repr(self.cmp.alignments('sit', 'sedere', top_n=1)[0]), + "Alignment(src='sit---', tar='sedere', score=220)", + ) + + self.assertEqual( + repr(self.cmp.alignments('doter', 'tugatir', top_n=0)), + "[Alignment(src='--doter', tar='tugatir', score=210), \ +Alignment(src='do--ter', tar='tugatir', score=210), \ +Alignment(src='d--oter', tar='tugatir', score=210)]", + ) + self.assertEqual( + repr(self.cmp.alignments('sit', 'sed')), + "[Alignment(src='sit', tar='sed', score=90), \ +Alignment(src='s-it', tar='sed-', score=200), \ +Alignment(src='sit-', tar='s-ed', score=200), \ +Alignment(src='--sit', tar='sed--', score=240), \ +Alignment(src='sit--', tar='--sed', score=240), \ +Alignment(src='-sit', tar='se-d', score=260), \ +Alignment(src='si-t', tar='-sed', score=260), \ +Alignment(src='-sit', tar='sed-', score=300), \ +Alignment(src='sit-', tar='-sed', score=300)]", + ) + self.assertEqual( + repr(self.cmp.alignments('sit', 'sīt', top_n=1)[0]), + "Alignment(src='sit', tar='sīt', score=10)", + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_damerau_levenshtein.py b/tests/distance/test_distance_damerau_levenshtein.py index 99faeca8f..72bc5b245 100644 --- a/tests/distance/test_distance_damerau_levenshtein.py +++ b/tests/distance/test_distance_damerau_levenshtein.py @@ -45,30 +45,26 @@ class DamerauLevenshteinTestCases(unittest.TestCase): """ cmp = DamerauLevenshtein() + cmp571010 = DamerauLevenshtein(cost=(5, 7, 10, 10)) + cmp1010510 = DamerauLevenshtein(cost=(10, 10, 5, 10)) + cmp55105 = DamerauLevenshtein(cost=(5, 5, 10, 5)) + cmp1010105 = DamerauLevenshtein(cost=(10, 10, 10, 5)) def test_damerau_levenshtein_dist_abs(self): """Test abydos.distance.DamerauLevenshtein.dist_abs.""" self.assertEqual(self.cmp.dist_abs('', ''), 0) self.assertEqual(self.cmp.dist_abs('CA', 'CA'), 0) self.assertEqual(self.cmp.dist_abs('CA', 'ABC'), 2) - self.assertEqual(self.cmp.dist_abs('', 'b', cost=(5, 7, 10, 10)), 5) - self.assertEqual(self.cmp.dist_abs('a', 'ab', cost=(5, 7, 10, 10)), 5) - self.assertEqual(self.cmp.dist_abs('b', '', cost=(5, 7, 10, 10)), 7) - self.assertEqual(self.cmp.dist_abs('ab', 'a', cost=(5, 7, 10, 10)), 7) - self.assertEqual(self.cmp.dist_abs('a', 'b', cost=(10, 10, 5, 10)), 5) - self.assertEqual( - self.cmp.dist_abs('ac', 'bc', cost=(10, 10, 5, 10)), 5 - ) - self.assertEqual(self.cmp.dist_abs('ab', 'ba', cost=(5, 5, 10, 5)), 5) - self.assertEqual( - self.cmp.dist_abs('abc', 'bac', cost=(5, 5, 10, 5)), 5 - ) - self.assertEqual( - self.cmp.dist_abs('cab', 'cba', cost=(5, 5, 10, 5)), 5 - ) - self.assertRaises( - ValueError, self.cmp.dist_abs, 'ab', 'ba', cost=(10, 10, 10, 5) - ) + self.assertEqual(self.cmp571010.dist_abs('', 'b'), 5) + self.assertEqual(self.cmp571010.dist_abs('a', 'ab'), 5) + self.assertEqual(self.cmp571010.dist_abs('b', ''), 7) + self.assertEqual(self.cmp571010.dist_abs('ab', 'a'), 
7) + self.assertEqual(self.cmp1010510.dist_abs('a', 'b'), 5) + self.assertEqual(self.cmp1010510.dist_abs('ac', 'bc'), 5) + self.assertEqual(self.cmp55105.dist_abs('ab', 'ba'), 5) + self.assertEqual(self.cmp55105.dist_abs('abc', 'bac'), 5) + self.assertEqual(self.cmp55105.dist_abs('cab', 'cba'), 5) + self.assertRaises(ValueError, self.cmp1010105.dist_abs, 'ab', 'ba') # Test wrapper self.assertEqual(damerau_levenshtein('CA', 'ABC'), 2) @@ -88,32 +84,16 @@ def test_damerau_dist(self): self.assertAlmostEqual(self.cmp.dist('abbc', 'abc'), 1 / 4) self.assertAlmostEqual(self.cmp.dist('CA', 'ABC'), 2 / 3) - self.assertAlmostEqual(self.cmp.dist('', 'b', cost=(5, 7, 10, 10)), 1) - self.assertAlmostEqual( - self.cmp.dist('a', 'ab', cost=(5, 7, 10, 10)), 1 / 2 - ) - self.assertAlmostEqual(self.cmp.dist('b', '', cost=(5, 7, 10, 10)), 1) - self.assertAlmostEqual( - self.cmp.dist('ab', 'a', cost=(5, 7, 10, 10)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.dist('a', 'b', cost=(10, 10, 5, 10)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.dist('ac', 'bc', cost=(10, 10, 5, 10)), 1 / 4 - ) - self.assertAlmostEqual( - self.cmp.dist('ab', 'ba', cost=(5, 5, 10, 5)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.dist('abc', 'bac', cost=(5, 5, 10, 5)), 1 / 3 - ) - self.assertAlmostEqual( - self.cmp.dist('cab', 'cba', cost=(5, 5, 10, 5)), 1 / 3 - ) - self.assertRaises( - ValueError, self.cmp.dist, 'ab', 'ba', cost=(10, 10, 10, 5) - ) + self.assertAlmostEqual(self.cmp571010.dist('', 'b'), 1) + self.assertAlmostEqual(self.cmp571010.dist('a', 'ab'), 1 / 2) + self.assertAlmostEqual(self.cmp571010.dist('b', ''), 1) + self.assertAlmostEqual(self.cmp571010.dist('ab', 'a'), 1 / 2) + self.assertAlmostEqual(self.cmp1010510.dist('a', 'b'), 1 / 2) + self.assertAlmostEqual(self.cmp1010510.dist('ac', 'bc'), 1 / 4) + self.assertAlmostEqual(self.cmp55105.dist('ab', 'ba'), 1 / 2) + self.assertAlmostEqual(self.cmp55105.dist('abc', 'bac'), 1 / 3) + self.assertAlmostEqual(self.cmp55105.dist('cab', 'cba'), 1 / 3) + self.assertRaises(ValueError, self.cmp1010105.dist, 'ab', 'ba') # Test wrapper self.assertAlmostEqual(dist_damerau('abbc', 'abc'), 1 / 4) @@ -133,32 +113,16 @@ def test_damerau_sim(self): self.assertAlmostEqual(self.cmp.sim('abbc', 'abc'), 3 / 4) self.assertAlmostEqual(self.cmp.sim('CA', 'ABC'), 1 / 3) - self.assertAlmostEqual(self.cmp.sim('', 'b', cost=(5, 7, 10, 10)), 0) - self.assertAlmostEqual( - self.cmp.sim('a', 'ab', cost=(5, 7, 10, 10)), 1 / 2 - ) - self.assertAlmostEqual(self.cmp.sim('b', '', cost=(5, 7, 10, 10)), 0) - self.assertAlmostEqual( - self.cmp.sim('ab', 'a', cost=(5, 7, 10, 10)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.sim('a', 'b', cost=(10, 10, 5, 10)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.sim('ac', 'bc', cost=(10, 10, 5, 10)), 3 / 4 - ) - self.assertAlmostEqual( - self.cmp.sim('ab', 'ba', cost=(5, 5, 10, 5)), 1 / 2 - ) - self.assertAlmostEqual( - self.cmp.sim('abc', 'bac', cost=(5, 5, 10, 5)), 2 / 3 - ) - self.assertAlmostEqual( - self.cmp.sim('cab', 'cba', cost=(5, 5, 10, 5)), 2 / 3 - ) - self.assertRaises( - ValueError, self.cmp.sim, 'ab', 'ba', cost=(10, 10, 10, 5) - ) + self.assertAlmostEqual(self.cmp571010.sim('', 'b'), 0) + self.assertAlmostEqual(self.cmp571010.sim('a', 'ab'), 1 / 2) + self.assertAlmostEqual(self.cmp571010.sim('b', ''), 0) + self.assertAlmostEqual(self.cmp571010.sim('ab', 'a'), 1 / 2) + self.assertAlmostEqual(self.cmp1010510.sim('a', 'b'), 1 / 2) + self.assertAlmostEqual(self.cmp1010510.sim('ac', 'bc'), 3 / 4) + 
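+        # cmp55105 uses cost=(5, 5, 10, 5), pricing a transposition the same
+        # as a single insertion or deletion, so 'ab' vs. 'ba' scores as a
+        # single transposition rather than two substitutions.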
self.assertAlmostEqual(self.cmp55105.sim('ab', 'ba'), 1 / 2) + self.assertAlmostEqual(self.cmp55105.sim('abc', 'bac'), 2 / 3) + self.assertAlmostEqual(self.cmp55105.sim('cab', 'cba'), 2 / 3) + self.assertRaises(ValueError, self.cmp1010105.sim, 'ab', 'ba') # Test wrapper self.assertAlmostEqual(sim_damerau('abbc', 'abc'), 3 / 4) diff --git a/tests/distance/test_distance_dennis.py b/tests/distance/test_distance_dennis.py new file mode 100644 index 000000000..ff352c586 --- /dev/null +++ b/tests/distance/test_distance_dennis.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_dennis. + +This module contains unit tests for abydos.distance.Dennis +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Dennis + + +class DennisTestCases(unittest.TestCase): + """Test Dennis functions. + + abydos.distance.Dennis + """ + + cmp = Dennis() + cmp_no_d = Dennis(alphabet=0) + + def test_dennis_sim(self): + """Test abydos.distance.Dennis.sim.""" + # Base cases + self.assertAlmostEqual(self.cmp.sim('', ''), 0.3333333333333333) + self.assertAlmostEqual(self.cmp.sim('a', ''), 0.3333333333333333) + self.assertAlmostEqual(self.cmp.sim('', 'a'), 0.3333333333333333) + self.assertAlmostEqual(self.cmp.sim('abc', ''), 0.3333333333333333) + self.assertAlmostEqual(self.cmp.sim('', 'abc'), 0.3333333333333333) + self.assertAlmostEqual(self.cmp.sim('abc', 'abc'), 0.9965986394557826) + self.assertAlmostEqual( + self.cmp.sim('abcd', 'efgh'), 0.32908163265306134 + ) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6615646259) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7693640991 + ) + + # Tests with alphabet=0 (no d factor) + self.assertAlmostEqual(self.cmp_no_d.sim('', ''), 0.3333333333333333) + self.assertAlmostEqual(self.cmp_no_d.sim('a', ''), 0.3333333333333333) + self.assertAlmostEqual(self.cmp_no_d.sim('', 'a'), 0.3333333333333333) + self.assertAlmostEqual( + self.cmp_no_d.sim('abc', ''), 0.3333333333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('', 'abc'), 0.3333333333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333 + ) + self.assertAlmostEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2222222222 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2788497568 + ) + + def test_dennis_dist(self): + """Test abydos.distance.Dennis.dist.""" + # Base cases + self.assertAlmostEqual(self.cmp.dist('', ''), 0.6666666666666667) + self.assertAlmostEqual(self.cmp.dist('a', ''), 0.6666666666666667) + self.assertAlmostEqual(self.cmp.dist('', 'a'), 0.6666666666666667) + self.assertAlmostEqual(self.cmp.dist('abc', ''), 0.6666666666666667) + self.assertAlmostEqual(self.cmp.dist('', 'abc'), 0.6666666666666667) + self.assertAlmostEqual( + self.cmp.dist('abc', 'abc'), 0.003401360544217358 + ) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.6709183673469387 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3384353741) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2306359009 + ) + + # Tests with alphabet=0 (no d factor) + self.assertAlmostEqual(self.cmp_no_d.dist('', ''), 0.6666666666666667) + self.assertAlmostEqual(self.cmp_no_d.dist('a', ''), 0.6666666666666667) + self.assertAlmostEqual(self.cmp_no_d.dist('', 'a'), 0.6666666666666667) + self.assertAlmostEqual( + self.cmp_no_d.dist('abc', ''), 0.6666666666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('', 'abc'), 0.6666666666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667 + ) + self.assertAlmostEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7211502432 + ) + + def test_dennis_sim_score(self): + """Test abydos.distance.Dennis.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.sim_score('abc', 'abc'), 27.85714285714286 + ) + self.assertAlmostEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.17857142857142858 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 13.7857142857 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 13.7857142857 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 13.7857142857 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 13.7857142857 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 18.3132921606 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), -1.5811388300841895 + ) + + 
self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.3057883149 + ) + + def test_dennis_corr(self): + """Test abydos.distance.Dennis.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertAlmostEqual(self.cmp.corr('abc', 'abc'), 0.994897959183674) + self.assertAlmostEqual( + self.cmp.corr('abcd', 'efgh'), -0.006377551020408 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4923469388) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6540461486 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0817253648 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_dice.py b/tests/distance/test_distance_dice.py index 86d8d66d8..1415579d9 100644 --- a/tests/distance/test_distance_dice.py +++ b/tests/distance/test_distance_dice.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Dice, dist_dice, sim_dice -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class DiceTestCases(unittest.TestCase): """ cmp = Dice() + cmp_q2 = Dice(tokenizer=QGrams(2)) + cmp_ws = Dice(tokenizer=WhitespaceTokenizer()) def test_dice_sim(self): """Test abydos.distance.Dice.sim.""" @@ -51,25 +53,47 @@ def test_dice_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 8 / 15) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 8 / 15 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper self.assertAlmostEqual(sim_dice('nelson', 'neilsen'), 8 / 15) @@ -81,25 +105,47 @@ def test_dice_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 2), 7 / 15) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + 
QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 7 / 15 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper self.assertAlmostEqual(dist_dice('nelson', 'neilsen'), 7 / 15) diff --git a/tests/distance/test_distance_dice_asymmetric_i.py b/tests/distance/test_distance_dice_asymmetric_i.py new file mode 100644 index 000000000..ed531432d --- /dev/null +++ b/tests/distance/test_distance_dice_asymmetric_i.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_dice_asymmetric_i. + +This module contains unit tests for abydos.distance.DiceAsymmetricI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import DiceAsymmetricI + + +class DiceAsymmetricITestCases(unittest.TestCase): + """Test DiceAsymmetricI functions. 
+ + abydos.distance.DiceAsymmetricI + """ + + cmp = DiceAsymmetricI() + + def test_dice_asymmetric_i_sim(self): + """Test abydos.distance.DiceAsymmetricI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + def test_dice_asymmetric_i_dist(self): + """Test abydos.distance.DiceAsymmetricI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_dice_asymmetric_ii.py b/tests/distance/test_distance_dice_asymmetric_ii.py new file mode 100644 index 000000000..2e6f86196 --- /dev/null +++ b/tests/distance/test_distance_dice_asymmetric_ii.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_dice_asymmetric_ii. + +This module contains unit tests for abydos.distance.DiceAsymmetricII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import DiceAsymmetricII + + +class DiceAsymmetricIITestCases(unittest.TestCase): + """Test DiceAsymmetricII functions. 
+ + abydos.distance.DiceAsymmetricII + """ + + cmp = DiceAsymmetricII() + + def test_dice_asymmetric_ii_sim(self): + """Test abydos.distance.DiceAsymmetricII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7) + + def test_dice_asymmetric_ii_dist(self): + """Test abydos.distance.DiceAsymmetricII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_digby.py b/tests/distance/test_distance_digby.py new file mode 100644 index 000000000..bf0455dab --- /dev/null +++ b/tests/distance/test_distance_digby.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_digby. + +This module contains unit tests for abydos.distance.Digby +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Digby +from abydos.tokenizer import QSkipgrams + + +class DigbyTestCases(unittest.TestCase): + """Test Digby functions. 
+ + abydos.distance.Digby + """ + + cmp = Digby() + cmp_no_d = Digby(alphabet=0) + + def test_digby_sim(self): + """Test abydos.distance.Digby.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9847181266) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9847181266) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9847181266) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9847181266) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9898546788 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_digby_dist(self): + """Test abydos.distance.Digby.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0152818734) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0152818734) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0152818734) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0152818734) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0101453212 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_digby_corr(self): + """Test abydos.distance.Digby.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + 
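+        # Note: corr() is the signed score bounded to [-1.0, 1.0]. The sim()
+        # and dist() values exercised earlier in this module are consistent
+        # with sim = (corr + 1) / 2 and dist = 1 - sim, e.g.
+        # (0.9694362533 + 1) / 2 matches the 0.9847181266 asserted above for
+        # 'Nigel' vs. 'Niall'.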
self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9694362533) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9694362533) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9694362533) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9694362533) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9797093576 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + self.assertEqual( + Digby(alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK')).corr( + 'a', 'eh' + ), + 0.0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_dispersion.py b/tests/distance/test_distance_dispersion.py new file mode 100644 index 000000000..d26f95748 --- /dev/null +++ b/tests/distance/test_distance_dispersion.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_dispersion. + +This module contains unit tests for abydos.distance.Dispersion +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Dispersion + + +class DispersionTestCases(unittest.TestCase): + """Test Dispersion functions. 
+ + abydos.distance.Dispersion + """ + + cmp = Dispersion() + cmp_no_d = Dispersion(alphabet=0) + + def test_dispersion_sim(self): + """Test abydos.distance.Dispersion.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5025380049979176) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.499979663421491) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5018839806) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5018839806) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5018839806) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5018839806) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5043748048 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.375) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4693877551 + ) + + def test_dispersion_dist(self): + """Test abydos.distance.Dispersion.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.49746199500208244) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.500020336578509) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4981160194) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4981160194) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4981160194) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4981160194) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4956251952 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.625) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5306122449 + ) + + def test_dispersion_corr(self): + """Test 
abydos.distance.Dispersion.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.005076009995835068) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -4.06731570179092e-05) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0037679613) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0037679613) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0037679613) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0037679613) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0087496095 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.25) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0612244898 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_doolittle.py b/tests/distance/test_distance_doolittle.py new file mode 100644 index 000000000..cabbcb07f --- /dev/null +++ b/tests/distance/test_distance_doolittle.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_doolittle. + +This module contains unit tests for abydos.distance.Doolittle +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Doolittle + + +class DoolittleTestCases(unittest.TestCase): + """Test Doolittle functions. 
+ + abydos.distance.Doolittle + """ + + cmp = Doolittle() + cmp_no_d = Doolittle(alphabet=0) + + def test_doolittle_sim(self): + """Test abydos.distance.Doolittle.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 4.1196952743799446e-05) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2461588279) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.439469213 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1090909091 + ) + + def test_doolittle_dist(self): + """Test abydos.distance.Doolittle.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9999588030472562) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7538411721) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.560530787 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8909090909 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_dunning.py b/tests/distance/test_distance_dunning.py new file mode 100644 index 000000000..a61b6c88e --- /dev/null +++ 
b/tests/distance/test_distance_dunning.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_dunning. + +This module contains unit tests for abydos.distance.Dunning +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Dunning + + +class DunningTestCases(unittest.TestCase): + """Test Dunning functions. + + abydos.distance.Dunning + """ + + cmp = Dunning() + cmp_no_d = Dunning(alphabet=0) + + def test_dunning_sim(self): + """Test abydos.distance.Dunning.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp.sim('abcd', 'efgh'), 0.0010606026735052122 + ) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3233318396) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3233318396) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3233318396) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3233318396) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.46141433614 + ) + + def test_dunning_dist(self): + """Test abydos.distance.Dunning.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.9989393973264948 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6766681604) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6766681604) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6766681604) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6766681604) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.53858566385 + ) + + def test_dunning_sim_score(self): + """Test abydos.distance.Dunning.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertAlmostEqual(self.cmp.sim_score('abc', 'abc'), 0.0923848802) + self.assertAlmostEqual( + self.cmp.sim_score('abcd', 'efgh'), 0.0001181119 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.0419023599 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 
'Nigel'), 0.0419023599 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.0419023599 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.0419023599 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.098245766 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('abc', 'abc'), 1.44385618977 + ) + self.assertAlmostEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 2.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.5032583348 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.5032583348 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.5032583348 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.5032583348 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.240203516 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_editex.py b/tests/distance/test_distance_editex.py index a901e9b74..79ed30310 100644 --- a/tests/distance/test_distance_editex.py +++ b/tests/distance/test_distance_editex.py @@ -40,6 +40,8 @@ class EditexTestCases(unittest.TestCase): """ cmp = Editex() + cmp_local = Editex(local=True) + cmp_taper = Editex(taper=True) def test_editex_dist_abs(self): """Test abydos.distance.Editex.dist_abs.""" @@ -57,24 +59,29 @@ def test_editex_dist_abs(self): self.assertEqual(self.cmp.dist_abs('neal', 'nihl'), 3) self.assertEqual(self.cmp.dist_abs('nihl', 'neal'), 3) + # Test tapering variant + self.assertAlmostEqual( + self.cmp_taper.dist_abs('nelson', 'neilsen'), 2.7142857143 + ) + # Test wrapper self.assertEqual(editex('niall', 'neal'), 1) def test_editex_dist_abs_local(self): """Test abydos.distance.Editex.dist_abs (local variant).""" - self.assertEqual(self.cmp.dist_abs('', '', local=True), 0) - self.assertEqual(self.cmp.dist_abs('nelson', '', local=True), 12) - self.assertEqual(self.cmp.dist_abs('', 'neilsen', local=True), 14) - self.assertEqual(self.cmp.dist_abs('ab', 'a', local=True), 2) - self.assertEqual(self.cmp.dist_abs('ab', 'c', local=True), 2) - self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen', local=True), 2) - self.assertEqual(self.cmp.dist_abs('neilsen', 'nelson', local=True), 2) - self.assertEqual(self.cmp.dist_abs('niall', 'neal', local=True), 1) - self.assertEqual(self.cmp.dist_abs('neal', 'niall', local=True), 1) - self.assertEqual(self.cmp.dist_abs('niall', 'nihal', local=True), 2) - self.assertEqual(self.cmp.dist_abs('nihal', 'niall', local=True), 2) - self.assertEqual(self.cmp.dist_abs('neal', 'nihl', local=True), 3) - self.assertEqual(self.cmp.dist_abs('nihl', 'neal', local=True), 3) + self.assertEqual(self.cmp_local.dist_abs('', ''), 0) + self.assertEqual(self.cmp_local.dist_abs('nelson', ''), 12) + self.assertEqual(self.cmp_local.dist_abs('', 'neilsen'), 14) + self.assertEqual(self.cmp_local.dist_abs('ab', 'a'), 2) + self.assertEqual(self.cmp_local.dist_abs('ab', 'c'), 2) + self.assertEqual(self.cmp_local.dist_abs('nelson', 'neilsen'), 2) + self.assertEqual(self.cmp_local.dist_abs('neilsen', 'nelson'), 2) + self.assertEqual(self.cmp_local.dist_abs('niall', 'neal'), 1) + 
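+        # Note: the local-matching behaviour is now bound at construction
+        # (cmp_local = Editex(local=True)) rather than passed to each call,
+        # so e.g. Editex(local=True).dist_abs('niall', 'neal') gives 1, as
+        # asserted above.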
self.assertEqual(self.cmp_local.dist_abs('neal', 'niall'), 1) + self.assertEqual(self.cmp_local.dist_abs('niall', 'nihal'), 2) + self.assertEqual(self.cmp_local.dist_abs('nihal', 'niall'), 2) + self.assertEqual(self.cmp_local.dist_abs('neal', 'nihl'), 3) + self.assertEqual(self.cmp_local.dist_abs('nihl', 'neal'), 3) # Test wrapper self.assertEqual(editex('niall', 'neal', local=True), 1) @@ -104,6 +111,11 @@ def test_editex_dist(self): self.assertAlmostEqual(self.cmp.dist('neilsen', 'nelson'), 2 / 14) self.assertEqual(self.cmp.dist('niall', 'neal'), 0.1) + # Test tapering variant + self.assertAlmostEqual( + self.cmp_taper.dist('nelson', 'neilsen'), 0.123376623 + ) + # Test wrapper self.assertEqual(dist_editex('niall', 'neal'), 0.1) diff --git a/tests/distance/test_distance_euclidean.py b/tests/distance/test_distance_euclidean.py index 8d2f1773d..56244e6a1 100644 --- a/tests/distance/test_distance_euclidean.py +++ b/tests/distance/test_distance_euclidean.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Euclidean, dist_euclidean, euclidean, sim_euclidean -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class EuclideanTestCases(unittest.TestCase): """ cmp = Euclidean() + cmp_q2 = Euclidean(tokenizer=QGrams(2)) + cmp_ws = Euclidean(tokenizer=WhitespaceTokenizer()) def test_euclidean_dist_abs(self): """Test abydos.distance.Euclidean.dist_abs.""" @@ -53,34 +55,52 @@ def test_euclidean_dist_abs(self): self.cmp.dist_abs('nelson', 'neilsen'), 7 ** 0.5 ) - self.assertEqual(self.cmp.dist_abs('', '', 2), 0) - self.assertEqual(self.cmp.dist_abs('nelson', '', 2), 7 ** 0.5) - self.assertEqual(self.cmp.dist_abs('', 'neilsen', 2), 8 ** 0.5) + self.assertEqual(self.cmp_q2.dist_abs('', ''), 0) + self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7 ** 0.5) + self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8 ** 0.5) self.assertAlmostEqual( - self.cmp.dist_abs('nelson', 'neilsen', 2), 7 ** 0.5 + self.cmp_q2.dist_abs('nelson', 'neilsen'), 7 ** 0.5 ) # supplied q-gram tests - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('')), 0) self.assertEqual( - self.cmp.dist_abs(QGrams('nelson'), QGrams('')), 7 ** 0.5 + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, ) self.assertEqual( - self.cmp.dist_abs(QGrams(''), QGrams('neilsen')), 8 ** 0.5 + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 7 ** 0.5, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8 ** 0.5, ) self.assertAlmostEqual( - self.cmp.dist_abs(QGrams('nelson'), QGrams('neilsen')), 7 ** 0.5 + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 ** 0.5, ) # non-q-gram tests - self.assertEqual(self.cmp.dist_abs('', '', 0), 0) - self.assertEqual(self.cmp.dist_abs('the quick', '', 0), 2 ** 0.5) - self.assertEqual(self.cmp.dist_abs('', 'the quick', 0), 2 ** 0.5) + self.assertEqual(self.cmp_ws.dist_abs('', ''), 0) + self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2 ** 0.5) + self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2 ** 0.5) self.assertAlmostEqual( - self.cmp.dist_abs(NONQ_FROM, NONQ_TO, 0), 8 ** 0.5 + self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8 ** 0.5 ) self.assertAlmostEqual( - self.cmp.dist_abs(NONQ_TO, NONQ_FROM, 0), 8 ** 0.5 + 
self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8 ** 0.5 ) # Test wrapper @@ -95,31 +115,52 @@ def test_euclidean_sim(self): self.cmp.sim('nelson', 'neilsen'), 1 - 7 ** 0.5 / 23 ** 0.5 ) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) self.assertAlmostEqual( - self.cmp.sim('nelson', 'neilsen', 2), 1 - 7 ** 0.5 / 23 ** 0.5 + self.cmp_q2.sim('nelson', 'neilsen'), 1 - 7 ** 0.5 / 23 ** 0.5 ) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), 1 - 7 ** 0.5 / 23 ** 0.5, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) self.assertAlmostEqual( - self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 - 8 ** 0.5 / 24 ** 0.5 + self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 - 8 ** 0.5 / 24 ** 0.5 ) self.assertAlmostEqual( - self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 - 8 ** 0.5 / 24 ** 0.5 + self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 - 8 ** 0.5 / 24 ** 0.5 ) # Test wrapper @@ -136,31 +177,52 @@ def test_euclidean_dist(self): self.cmp.dist('nelson', 'neilsen'), 7 ** 0.5 / 23 ** 0.5 ) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) self.assertAlmostEqual( - self.cmp.dist('nelson', 'neilsen', 2), 7 ** 0.5 / 23 ** 0.5 + self.cmp_q2.dist('nelson', 'neilsen'), 7 ** 0.5 / 23 ** 0.5 ) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), 7 ** 0.5 / 23 ** 0.5, ) # 
non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) self.assertAlmostEqual( - self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 8 ** 0.5 / 24 ** 0.5 + self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 8 ** 0.5 / 24 ** 0.5 ) self.assertAlmostEqual( - self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 8 ** 0.5 / 24 ** 0.5 + self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 8 ** 0.5 / 24 ** 0.5 ) # Test wrapper diff --git a/tests/distance/test_distance_eudex.py b/tests/distance/test_distance_eudex.py index dbf1659f4..7cf82b7fb 100644 --- a/tests/distance/test_distance_eudex.py +++ b/tests/distance/test_distance_eudex.py @@ -50,35 +50,35 @@ def test_eudex_dist_abs(self): """Test abydos.distance.Eudex.dist_abs.""" # Base cases self.assertEqual(self.cmp.dist_abs('', ''), 0) - self.assertEqual(self.cmp.dist_abs('', '', None), 0) - self.assertEqual(self.cmp.dist_abs('', '', 'fibonacci'), 0) - self.assertEqual(self.cmp.dist_abs('', '', [10, 1, 1, 1]), 0) - self.assertEqual(self.cmp.dist_abs('', '', _yield_1), 0) + self.assertEqual(Eudex(None).dist_abs('', ''), 0) + self.assertEqual(Eudex('fibonacci').dist_abs('', ''), 0) + self.assertEqual(Eudex([10, 1, 1, 1]).dist_abs('', ''), 0) + self.assertEqual(Eudex(_yield_1).dist_abs('', ''), 0) self.assertEqual(self.cmp.dist_abs('', '', normalized=True), 0) self.assertEqual(self.cmp.dist_abs('Niall', 'Niall'), 0) - self.assertEqual(self.cmp.dist_abs('Niall', 'Niall', None), 0) - self.assertEqual(self.cmp.dist_abs('Niall', 'Niall', 'fibonacci'), 0) - self.assertEqual(self.cmp.dist_abs('Niall', 'Niall', [10, 1, 1, 1]), 0) - self.assertEqual(self.cmp.dist_abs('Niall', 'Niall', _yield_1), 0) + self.assertEqual(Eudex(None).dist_abs('Niall', 'Niall'), 0) + self.assertEqual(Eudex('fibonacci').dist_abs('Niall', 'Niall'), 0) + self.assertEqual(Eudex([10, 1, 1, 1]).dist_abs('Niall', 'Niall'), 0) + self.assertEqual(Eudex(_yield_1).dist_abs('Niall', 'Niall'), 0) self.assertEqual( self.cmp.dist_abs('Niall', 'Niall', normalized=True), 0 ) self.assertEqual(self.cmp.dist_abs('Niall', 'Neil'), 2) - self.assertEqual(self.cmp.dist_abs('Niall', 'Neil', None), 1) - self.assertEqual(self.cmp.dist_abs('Niall', 'Neil', 'fibonacci'), 2) - self.assertEqual(self.cmp.dist_abs('Niall', 'Neil', [10, 1, 1, 1]), 1) - self.assertEqual(self.cmp.dist_abs('Niall', 'Neil', _yield_1), 1) + self.assertEqual(Eudex(None).dist_abs('Niall', 'Neil'), 1) + self.assertEqual(Eudex('fibonacci').dist_abs('Niall', 'Neil'), 2) + self.assertEqual(Eudex([10, 1, 1, 1]).dist_abs('Niall', 'Neil'), 1) + self.assertEqual(Eudex(_yield_1).dist_abs('Niall', 'Neil'), 1) self.assertAlmostEqual( self.cmp.dist_abs('Niall', 'Neil', normalized=True), 0.00098039 ) self.assertEqual(self.cmp.dist_abs('Niall', 'Colin'), 524) - self.assertEqual(self.cmp.dist_abs('Niall', 'Colin', None), 10) - self.assertEqual(self.cmp.dist_abs('Niall', 'Colin', 'fibonacci'), 146) - self.assertEqual(self.cmp.dist_abs('Niall', 'Colin', [10, 1, 1, 1]), 6) - self.assertEqual(self.cmp.dist_abs('Niall', 'Colin', _yield_1), 10) + self.assertEqual(Eudex(None).dist_abs('Niall', 'Colin'), 10) + self.assertEqual(Eudex('fibonacci').dist_abs('Niall', 'Colin'), 146) + self.assertEqual(Eudex([10, 1, 1, 1]).dist_abs('Niall', 'Colin'), 42) + self.assertEqual(Eudex(_yield_1).dist_abs('Niall', 'Colin'), 10) 
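+        # Note: the weights series is now an Eudex constructor argument
+        # rather than a per-call parameter, so each weighting gets its own
+        # instance, e.g. Eudex('fibonacci').dist_abs('Niall', 'Neil')
+        # returns 2 and Eudex(None).dist_abs('Niall', 'Neil') returns 1, per
+        # the assertions above.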
self.assertAlmostEqual( self.cmp.dist_abs('Niall', 'Colin', normalized=True), 0.25686274 ) @@ -90,29 +90,28 @@ def test_eudex_dist(self): """Test abydos.distance.Eudex.dist.""" # Base cases self.assertEqual(self.cmp.dist('', ''), 0) - self.assertEqual(self.cmp.dist('', '', None), 0) - self.assertEqual(self.cmp.dist('', '', 'fibonacci'), 0) + self.assertEqual(Eudex(None).dist('', ''), 0) + self.assertEqual(Eudex('fibonacci').dist('', ''), 0) self.assertEqual(self.cmp.dist('Niall', 'Niall'), 0) - self.assertEqual(self.cmp.dist('Niall', 'Niall', None), 0) - self.assertEqual(self.cmp.dist('Niall', 'Niall', 'fibonacci'), 0) + self.assertEqual(Eudex(None).dist('Niall', 'Niall'), 0) + self.assertEqual(Eudex('fibonacci').dist('Niall', 'Niall'), 0) self.assertAlmostEqual(self.cmp.dist('Niall', 'Neil'), 0.00098039) + self.assertAlmostEqual(Eudex(None).dist('Niall', 'Neil'), 0.11111111) self.assertAlmostEqual( - self.cmp.dist('Niall', 'Neil', None), 0.11111111 - ) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Neil', 'fibonacci'), 0.00287356 + Eudex('fibonacci').dist('Niall', 'Neil'), 0.00287356 ) self.assertAlmostEqual(self.cmp.dist('Niall', 'Colin'), 0.25686275) + self.assertAlmostEqual(Eudex(None).dist('Niall', 'Colin'), 0.16666667) self.assertAlmostEqual( - self.cmp.dist('Niall', 'Colin', None), 0.16666667 - ) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Colin', 'fibonacci'), 0.20977011 + Eudex('fibonacci').dist('Niall', 'Colin'), 0.20977011 ) + with self.assertRaises(ValueError): + Eudex('veryLarge').dist_abs('Niall', 'Colin') + # Test wrapper self.assertAlmostEqual( dist_eudex('Niall', 'Neil', 'fibonacci'), 0.00287356 @@ -122,25 +121,23 @@ def test_eudex_sim(self): """Test abydos.distance.Eudex.sim.""" # Base cases self.assertEqual(self.cmp.sim('', ''), 1) - self.assertEqual(self.cmp.sim('', '', None), 1) - self.assertEqual(self.cmp.sim('', '', 'fibonacci'), 1) + self.assertEqual(Eudex(None).sim('', ''), 1) + self.assertEqual(Eudex('fibonacci').sim('', ''), 1) self.assertEqual(self.cmp.sim('Niall', 'Niall'), 1) - self.assertEqual(self.cmp.sim('Niall', 'Niall', None), 1) - self.assertEqual(self.cmp.sim('Niall', 'Niall', 'fibonacci'), 1) + self.assertEqual(Eudex(None).sim('Niall', 'Niall'), 1) + self.assertEqual(Eudex('fibonacci').sim('Niall', 'Niall'), 1) self.assertAlmostEqual(self.cmp.sim('Niall', 'Neil'), 0.99901961) - self.assertAlmostEqual(self.cmp.sim('Niall', 'Neil', None), 0.88888889) + self.assertAlmostEqual(Eudex(None).sim('Niall', 'Neil'), 0.88888889) self.assertAlmostEqual( - self.cmp.sim('Niall', 'Neil', 'fibonacci'), 0.99712644 + Eudex('fibonacci').sim('Niall', 'Neil'), 0.99712644 ) self.assertAlmostEqual(self.cmp.sim('Niall', 'Colin'), 0.74313725) + self.assertAlmostEqual(Eudex(None).sim('Niall', 'Colin'), 0.83333333) self.assertAlmostEqual( - self.cmp.sim('Niall', 'Colin', None), 0.83333333 - ) - self.assertAlmostEqual( - self.cmp.sim('Niall', 'Colin', 'fibonacci'), 0.79022989 + Eudex('fibonacci').sim('Niall', 'Colin'), 0.79022989 ) # Test wrapper diff --git a/tests/distance/test_distance_eyraud.py b/tests/distance/test_distance_eyraud.py new file mode 100644 index 000000000..3a45d7328 --- /dev/null +++ b/tests/distance/test_distance_eyraud.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_eyraud. + +This module contains unit tests for abydos.distance.Eyraud +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Eyraud + + +class EyraudTestCases(unittest.TestCase): + """Test Eyraud functions. + + abydos.distance.Eyraud + """ + + cmp = Eyraud() + cmp_no_d = Eyraud(alphabet=0) + + def test_eyraud_sim(self): + """Test abydos.distance.Eyraud.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.sim('abc', 'abc'), 1.2327416173570019e-06 + ) + self.assertAlmostEqual( + self.cmp.sim('abcd', 'efgh'), 1.6478781097519779e-06 + ) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 1.5144e-06) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 1.5144e-06) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 1.5144e-06) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 1.5144e-06) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 1.565e-06 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.75) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.04) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.078030303 + ) + + def test_eyraud_dist(self): + """Test abydos.distance.Eyraud.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp.dist('abc', 'abc'), 0.9999987672583827) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.9999983521218903 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9999984856) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9999984856) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9999984856) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9999984856) + self.assertAlmostEqual( + 
self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.999998435 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.25) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.96) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8981481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8981481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8981481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8981481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.921969697 + ) + + def test_eyraud_sim_score(self): + """Test abydos.distance.Eyraud.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.sim_score('abc', 'abc'), -1.2327416173570019e-06 + ) + self.assertAlmostEqual( + self.cmp.sim_score('abcd', 'efgh'), -1.6478781097519779e-06 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), -1.5144e-06 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), -1.5144e-06 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), -1.5144e-06 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), -1.5144e-06 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), -1.565e-06 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), -0.75) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), -0.04) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), -0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), -0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), -0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), -0.1018518519 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.078030303 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fager_mcgowan.py b/tests/distance/test_distance_fager_mcgowan.py new file mode 100644 index 000000000..d94260628 --- /dev/null +++ b/tests/distance/test_distance_fager_mcgowan.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fager_mcgowan. + +This module contains unit tests for abydos.distance.FagerMcGowan +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FagerMcGowan + + +class FagerMcGowanTestCases(unittest.TestCase): + """Test FagerMcGowan functions. + + abydos.distance.FagerMcGowan + """ + + cmp = FagerMcGowan() + + def test_fager_mcgowan_sim(self): + """Test abydos.distance.FagerMcGowan.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.75) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2958758548) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5166681402 + ) + + def test_fager_mcgowan_dist(self): + """Test abydos.distance.FagerMcGowan.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.25) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7041241452) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4833318598 + ) + + def test_fager_mcgowan_sim_score(self): + """Test abydos.distance.FagerMcGowan.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.75) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.22360679774997896 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.5166681402 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_faith.py b/tests/distance/test_distance_faith.py new file mode 100644 
index 000000000..f57990a72 --- /dev/null +++ b/tests/distance/test_distance_faith.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_faith. + +This module contains unit tests for abydos.distance.Faith +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Faith + + +class FaithTestCases(unittest.TestCase): + """Test Faith functions. + + abydos.distance.Faith + """ + + cmp = Faith() + cmp_no_d = Faith(alphabet=0) + + def test_faith_sim(self): + """Test abydos.distance.Faith.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.4987244897959184) + self.assertEqual(self.cmp.sim('', 'a'), 0.4987244897959184) + self.assertEqual(self.cmp.sim('abc', ''), 0.49744897959183676) + self.assertEqual(self.cmp.sim('', 'abc'), 0.49744897959183676) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5025510204081632) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49362244897959184) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4980867347) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4980867347) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4980867347) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4980867347) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_faith_dist(self): + """Test abydos.distance.Faith.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5012755102040816) + self.assertEqual(self.cmp.dist('', 'a'), 0.5012755102040816) + self.assertEqual(self.cmp.dist('abc', ''), 0.5025510204081632) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5025510204081632) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.49744897959183676) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5063775510204082) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5019132653) + 
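+        # Note: these dist() values are the complements of the sim() values
+        # tested above (dist = 1 - sim), e.g. 1 - 0.4980867347 matches the
+        # 0.5019132653 asserted here for 'Nigel' vs. 'Niall'.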
self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5019132653) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5019132653) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5019132653) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fellegi_sunter.py b/tests/distance/test_distance_fellegi_sunter.py new file mode 100644 index 000000000..9d72be84e --- /dev/null +++ b/tests/distance/test_distance_fellegi_sunter.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fellegi_sunter. + +This module contains unit tests for abydos.distance.FellegiSunter +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FellegiSunter + + +class FellegiSunterTestCases(unittest.TestCase): + """Test FellegiSunter functions. 
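The Faith base cases above follow if the measure is read as (a + d/2) / (a + b + c + d) over a fixed universe of 784 possible 2-grams (the library's default alphabet size), with d counting tokens absent from both strings. A hand-check under that assumption, with the contingency counts filled in by inspection of the padded bigrams:

    def faith_sim(a, b, c, alphabet=784):
        """Faith similarity from 2x2 contingency counts (assumed formula)."""
        d = alphabet - (a + b + c)      # tokens in neither string
        return (a + d / 2) / (a + b + c + d)

    # 'a' vs '': the single character contributes 2 padded bigrams ('$a', 'a#')
    print(faith_sim(a=0, b=2, c=0))   # 0.4987244897959184
    # 'abc' vs 'abc': 4 shared padded bigrams
    print(faith_sim(a=4, b=0, c=0))   # 0.5025510204081632
    # 'abcd' vs 'efgh': 5 bigrams per side, none shared
    print(faith_sim(a=0, b=5, c=5))   # 0.49362244897959184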
+ + abydos.distance.FellegiSunter + """ + + cmp = FellegiSunter() + cmp_simp = FellegiSunter(simplified=True) + + def test_fellegi_sunter_sim(self): + """Test abydos.distance.FellegiSunter.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.586895558534099) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.270318312) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.270318312) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.270318312) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.270318312) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.2814144756 + ) + + # Simplified variant cases + self.assertEqual(self.cmp_simp.sim('', ''), 0.0) + self.assertEqual(self.cmp_simp.sim('a', ''), 0.0) + self.assertEqual(self.cmp_simp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_simp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_simp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_simp.sim('abc', 'abc'), 0.9241962407465937) + self.assertEqual(self.cmp_simp.sim('abcd', 'efgh'), 0.0) + self.assertAlmostEqual( + self.cmp_simp.sim('Nigel', 'Niall'), 0.2687639203842084 + ) + self.assertAlmostEqual( + self.cmp_simp.sim('Niall', 'Nigel'), 0.2687639203842084 + ) + self.assertAlmostEqual( + self.cmp_simp.sim('Colin', 'Coiln'), 0.2687639203842084 + ) + self.assertAlmostEqual( + self.cmp_simp.sim('Coiln', 'Colin'), 0.2687639203842084 + ) + self.assertAlmostEqual( + self.cmp_simp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5959107950190301 + ) + + def test_fellegi_sunter_dist(self): + """Test abydos.distance.FellegiSunter.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.413104441465901) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.729681688) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.729681688) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.729681688) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.729681688) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.7185855244 + ) + + # Simplified variant cases + self.assertEqual(self.cmp_simp.dist('', ''), 1.0) + self.assertEqual(self.cmp_simp.dist('a', ''), 1.0) + self.assertEqual(self.cmp_simp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_simp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_simp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_simp.dist('abc', 'abc'), 0.07580375925340632) + self.assertEqual(self.cmp_simp.dist('abcd', 'efgh'), 1.0) + self.assertAlmostEqual( + self.cmp_simp.dist('Nigel', 'Niall'), 0.7312360796157916 + ) + self.assertAlmostEqual( + self.cmp_simp.dist('Niall', 'Nigel'), 0.7312360796157916 + ) + self.assertAlmostEqual( + self.cmp_simp.dist('Colin', 'Coiln'), 0.7312360796157916 + ) + self.assertAlmostEqual( + self.cmp_simp.dist('Coiln', 'Colin'), 0.7312360796157916 + ) + self.assertAlmostEqual( + self.cmp_simp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4040892049809699 + ) + + def 
test_fellegi_sunter_sim_score(self): + """Test abydos.distance.FellegiSunter.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 1.760686675602297) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 1.3515915598 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 1.3515915598 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 1.3515915598 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 1.3515915598 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 2.8141447562 + ) + + # Simplified variant cases + self.assertEqual(self.cmp_simp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_simp.sim_score('a', ''), -0.6931471805599453) + self.assertEqual(self.cmp_simp.sim_score('', 'a'), 0.0) + self.assertEqual( + self.cmp_simp.sim_score('abc', ''), -2.772588722239781 + ) + self.assertEqual(self.cmp_simp.sim_score('', 'abc'), 0.0) + self.assertEqual( + self.cmp_simp.sim_score('abc', 'abc'), 5.545177444479562 + ) + self.assertEqual( + self.cmp_simp.sim_score('abcd', 'efgh'), -4.023594781085251 + ) + self.assertAlmostEqual( + self.cmp_simp.sim_score('Nigel', 'Niall'), 2.6876392038420835 + ) + self.assertAlmostEqual( + self.cmp_simp.sim_score('Niall', 'Nigel'), 2.6876392038420835 + ) + self.assertAlmostEqual( + self.cmp_simp.sim_score('Colin', 'Coiln'), 2.6876392038420835 + ) + self.assertAlmostEqual( + self.cmp_simp.sim_score('Coiln', 'Colin'), 2.6876392038420835 + ) + self.assertAlmostEqual( + self.cmp_simp.sim_score('ATCAACGAGT', 'AACGATTAG'), + 11.322305105361572, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fidelity.py b/tests/distance/test_distance_fidelity.py new file mode 100644 index 000000000..90bca8804 --- /dev/null +++ b/tests/distance/test_distance_fidelity.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fidelity. + +This module contains unit tests for abydos.distance.Fidelity +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Fidelity + + +class FidelityTestCases(unittest.TestCase): + """Test Fidelity functions. 
+ + abydos.distance.Fidelity + """ + + cmp = Fidelity() + + def test_fidelity_sim(self): + """Test abydos.distance.Fidelity.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + def test_fidelity_dist(self): + """Test abydos.distance.Fidelity.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fleiss.py b/tests/distance/test_distance_fleiss.py new file mode 100644 index 000000000..da3d34583 --- /dev/null +++ b/tests/distance/test_distance_fleiss.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fleiss. + +This module contains unit tests for abydos.distance.Fleiss +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Fleiss + + +class FleissTestCases(unittest.TestCase): + """Test Fleiss functions. 
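The Fidelity expectations above behave like a squared Bhattacharyya coefficient over bigram frequency distributions, i.e. (sum_i sqrt(p_i * q_i))^2. A small self-contained sketch under that assumption (padded multiset bigrams, which is what the default tokenizer appears to produce), reproducing two of the asserted values:

    from collections import Counter
    from math import sqrt

    def bigram_probs(word):
        """Relative frequencies of padded 2-grams of `word`."""
        padded = '$' + word + '#'
        counts = Counter(padded[i:i + 2] for i in range(len(padded) - 1))
        total = sum(counts.values())
        return {gram: n / total for gram, n in counts.items()}

    def fidelity(src, tar):
        """Squared Bhattacharyya coefficient of the two bigram distributions."""
        p, q = bigram_probs(src), bigram_probs(tar)
        return sum(sqrt(p[g] * q[g]) for g in p.keys() & q.keys()) ** 2

    print(fidelity('Nigel', 'Niall'))           # 0.25
    print(fidelity('ATCAACGAGT', 'AACGATTAG'))  # ~0.4454545455 (49/110)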
+ + abydos.distance.Fleiss + """ + + cmp = Fleiss() + cmp_no_d = Fleiss(alphabet=0) + + def test_fleiss_sim(self): + """Test abydos.distance.Fleiss.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8318286736 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3340909091 + ) + + def test_fleiss_dist(self): + """Test abydos.distance.Fleiss.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1681713264 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6659090909 + ) + + def test_fleiss_corr(self): + """Test abydos.distance.Fleiss.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6636573473 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3318181818 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fleiss_levin_paik.py b/tests/distance/test_distance_fleiss_levin_paik.py new file mode 100644 index 000000000..9fc27ddec --- /dev/null +++ b/tests/distance/test_distance_fleiss_levin_paik.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fleiss_levin_paik. + +This module contains unit tests for abydos.distance.FleissLevinPaik +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FleissLevinPaik + + +class FleissLevinPaikTestCases(unittest.TestCase): + """Test FleissLevinPaik functions. 
+ + abydos.distance.FleissLevinPaik + """ + + cmp = FleissLevinPaik() + cmp_no_d = FleissLevinPaik(alphabet=0) + + def test_fleiss_levin_paik_sim(self): + """Test abydos.distance.FleissLevinPaik.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918263) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918263) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429667) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429667) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9954751131 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_fleiss_levin_paik_dist(self): + """Test abydos.distance.FleissLevinPaik.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033292) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033292) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0045248869 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/distance/test_distance_flexmetric.py b/tests/distance/test_distance_flexmetric.py new file mode 100644 index 000000000..cf982f6b4 --- /dev/null +++ b/tests/distance/test_distance_flexmetric.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_flexmetric. + +This module contains unit tests for abydos.distance.FlexMetric +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FlexMetric + + +class FlexMetricTestCases(unittest.TestCase): + """Test FlexMetric functions. + + abydos.distance.FlexMetric + """ + + cmp = FlexMetric() + cmp_custom = FlexMetric( + indel_costs=[(set('aeiou'), 0.1), (set('bcdfghjklmnpqrstvwxyz'), 0.9)], + subst_costs=[(set('aeiou'), 0.1), (set('bcdfghjklmnpqrstvwxyz'), 0.9)], + ) + + def test_flexmetric_dist(self): + """Test abydos.distance.FlexMetric.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertAlmostEqual(self.cmp.dist('abc', ''), 0.7999999999999999) + self.assertAlmostEqual(self.cmp.dist('', 'abc'), 0.7999999999999999) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual(self.cmp.dist('abcd', 'efgh'), 0.925) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.26) + + def test_flexmetric_sim(self): + """Test abydos.distance.FlexMetric.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertAlmostEqual(self.cmp.sim('abc', ''), 0.20000000000000007) + self.assertAlmostEqual(self.cmp.sim('', 'abc'), 0.20000000000000007) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp.sim('abcd', 'efgh'), 0.075) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.74) + + def test_flexmetric_dist_abs(self): + """Test abydos.distance.FlexMetric.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2.4) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2.4) + 
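The cmp_custom fixture above (cheap edits for vowels, expensive ones for consonants) is easiest to reason about as a weighted Levenshtein distance in which an insertion or deletion costs the class weight of the affected character and a substitution costs the class weight of the character being replaced. That reading of FlexMetric's internals is an assumption, but a generic weighted edit distance built on it does reproduce the two cmp_custom expectations checked further down ('Nigel'/'Niall' gives 1.0, 'Colin'/'Coiln' gives 0.2):

    def char_cost(ch, vowel_cost=0.1, consonant_cost=0.9):
        """Per-character weight used for indels and substitutions (illustrative)."""
        return vowel_cost if ch.lower() in 'aeiou' else consonant_cost

    def weighted_levenshtein(src, tar):
        """Levenshtein DP with character-class edit weights (not Abydos's code)."""
        rows, cols = len(src) + 1, len(tar) + 1
        d = [[0.0] * cols for _ in range(rows)]
        for i in range(1, rows):
            d[i][0] = d[i - 1][0] + char_cost(src[i - 1])   # deletions
        for j in range(1, cols):
            d[0][j] = d[0][j - 1] + char_cost(tar[j - 1])   # insertions
        for i in range(1, rows):
            for j in range(1, cols):
                sub = 0.0 if src[i - 1] == tar[j - 1] else char_cost(src[i - 1])
                d[i][j] = min(
                    d[i - 1][j] + char_cost(src[i - 1]),    # delete from src
                    d[i][j - 1] + char_cost(tar[j - 1]),    # insert into src
                    d[i - 1][j - 1] + sub,                  # substitute or match
                )
        return d[-1][-1]

    print(weighted_levenshtein('Nigel', 'Niall'))  # 1.0
    print(weighted_levenshtein('Colin', 'Coiln'))  # 0.2 (drop and re-insert the cheap 'i')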
self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertAlmostEqual( + self.cmp.dist_abs('abcd', 'efgh'), 3.6999999999999997 + ) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 1.5) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 1.5) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2.6 + ) + + self.assertAlmostEqual(self.cmp_custom.dist_abs('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_custom.dist_abs('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_custom.dist_abs('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp_custom.dist_abs('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp_custom.dist_abs('ATCAACGAGT', 'AACGATTAG'), 3.7 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_forbes_i.py b/tests/distance/test_distance_forbes_i.py new file mode 100644 index 000000000..c224c15e5 --- /dev/null +++ b/tests/distance/test_distance_forbes_i.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_forbes_i. + +This module contains unit tests for abydos.distance.ForbesI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ForbesI + + +class ForbesITestCases(unittest.TestCase): + """Test ForbesI functions. 
+ + abydos.distance.ForbesI + """ + + cmp = ForbesI() + cmp_no_d = ForbesI(alphabet=0) + + def test_forbes_i_sim(self): + """Test abydos.distance.ForbesI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.8909090909 + ) + + def test_forbes_i_dist(self): + """Test abydos.distance.ForbesI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.1090909091 + ) + + def test_forbes_i_sim_score(self): + """Test abydos.distance.ForbesI.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + 
self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 196.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 65.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 65.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 65.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 65.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 49.8909090909 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.8909090909 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_forbes_ii.py b/tests/distance/test_distance_forbes_ii.py new file mode 100644 index 000000000..eb4ce4b09 --- /dev/null +++ b/tests/distance/test_distance_forbes_ii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_forbes_ii. + +This module contains unit tests for abydos.distance.ForbesII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ForbesII + + +class ForbesIITestCases(unittest.TestCase): + """Test ForbesII functions. 
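The Forbes I raw scores above are consistent with n*a / ((a+b)(a+c)), where n is the size of the 2-gram universe (784 by default, or just a+b+c when alphabet=0 is passed). A quick check under that assumption:

    def forbes_i_score(a, b, c, alphabet=784):
        """Forbes I raw score from 2x2 contingency counts (assumed formula)."""
        n = alphabet if alphabet else a + b + c
        return n * a / ((a + b) * (a + c))

    print(forbes_i_score(4, 0, 0))              # 196.0   ('abc' vs 'abc')
    print(forbes_i_score(3, 3, 3))              # ~65.333 ('Nigel' vs 'Niall')
    print(forbes_i_score(3, 3, 3, alphabet=0))  # 0.75    (the alphabet=0 variant above)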
+ + abydos.distance.ForbesII + """ + + cmp = ForbesII() + cmp_no_d = ForbesII(alphabet=0) + + def test_forbes_ii_sim(self): + """Test abydos.distance.ForbesII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8478654592 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3 + ) + + def test_forbes_ii_dist(self): + """Test abydos.distance.ForbesII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1521345408 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7 + ) + + def test_forbes_ii_corr(self): + """Test abydos.distance.ForbesII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6957309185 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.4 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fossum.py b/tests/distance/test_distance_fossum.py new file mode 100644 index 000000000..5deb8f66e --- /dev/null +++ b/tests/distance/test_distance_fossum.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fossum. + +This module contains unit tests for abydos.distance.Fossum +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Fossum + + +class FossumTestCases(unittest.TestCase): + """Test Fossum functions. 
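The Forbes II correlations above can likewise be hand-checked if the measure is taken as (n*a - (a+b)(a+c)) / (n*min(a+b, a+c) - (a+b)(a+c)), again with n = 784 by default. Under that assumption:

    def forbes_ii_corr(a, b, c, alphabet=784):
        """Forbes II correlation from 2x2 contingency counts (assumed formula)."""
        n = alphabet if alphabet else a + b + c
        marg = (a + b) * (a + c)
        return (n * a - marg) / (n * min(a + b, a + c) - marg)

    print(forbes_ii_corr(0, 5, 5))   # -0.006418485237483954 ('abcd' vs 'efgh')
    print(forbes_ii_corr(3, 3, 3))   # ~0.4961439589         ('Nigel' vs 'Niall')
    print(forbes_ii_corr(7, 4, 3))   # ~0.6957309185         ('ATCAACGAGT' vs 'AACGATTAG')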
+ + abydos.distance.Fossum + """ + + cmp = Fossum() + cmp_no_d = Fossum(alphabet=0) + + def test_fossum_sim(self): + """Test abydos.distance.Fossum.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.2222222222222222) + self.assertEqual(self.cmp.sim('', 'a'), 0.2222222222222222) + self.assertEqual(self.cmp.sim('abc', ''), 0.08163265306122448) + self.assertEqual(self.cmp.sim('', 'abc'), 0.08163265306122448) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.01234567901234568) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2066115702) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2066115702) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2066115702) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2066115702) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4215419501 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.2222222222222222) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.2222222222222222) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.08163265306122448) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.08163265306122448) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 0.02469135802469136 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3099173554 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3099173554 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3099173554 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3099173554 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5365079365 + ) + + def test_fossum_dist(self): + """Test abydos.distance.Fossum.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.7777777777777778) + self.assertEqual(self.cmp.dist('', 'a'), 0.7777777777777778) + self.assertEqual(self.cmp.dist('abc', ''), 0.9183673469387755) + self.assertEqual(self.cmp.dist('', 'abc'), 0.9183673469387755) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9876543209876543) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7933884298) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7933884298) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7933884298) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7933884298) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5784580499 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.7777777777777778) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.7777777777777778) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.9183673469387755) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.9183673469387755) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.9753086419753086 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6900826446 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6900826446 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6900826446 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6900826446 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.4634920635 + ) + + def test_fossum_sim_score(self): + """Test abydos.distance.Fossum.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 196.0) + self.assertEqual(self.cmp.sim_score('a', ''), 98.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 98.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 49.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 49.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 600.25) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 7.84) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 136.1111111111 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 136.1111111111 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 136.1111111111 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 136.1111111111 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 301.1272727273 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.25) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.25) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.25) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.25) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 3.0625) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 1.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 1.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 1.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 1.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 5.3772727273 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fuzzywuzzy_partial_string.py b/tests/distance/test_distance_fuzzywuzzy_partial_string.py new file mode 100644 index 000000000..f98e42183 --- /dev/null +++ b/tests/distance/test_distance_fuzzywuzzy_partial_string.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fuzzywuzzy_partial_string. + +This module contains unit tests for abydos.distance.FuzzyWuzzyPartialString +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FuzzyWuzzyPartialString + + +class FuzzyWuzzyPartialStringTestCases(unittest.TestCase): + """Test FuzzyWuzzyPartialString functions. 
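The Fossum raw scores above are consistent with n*(a - 1/2)^2 / ((a+b)(a+c)), with zero marginals clamped to 1, which is what makes the empty-string base cases come out to 196, 98, and 49. A check under that assumed reading:

    def fossum_score(a, b, c, alphabet=784):
        """Fossum raw score from 2x2 contingency counts (assumed formula)."""
        n = alphabet if alphabet else a + b + c
        return n * (a - 0.5) ** 2 / (max(a + b, 1) * max(a + c, 1))

    print(fossum_score(0, 0, 0))   # 196.0  ('' vs '')
    print(fossum_score(0, 2, 0))   # 98.0   ('a' vs '')
    print(fossum_score(4, 0, 0))   # 600.25 ('abc' vs 'abc')
    print(fossum_score(0, 5, 5))   # 7.84   ('abcd' vs 'efgh')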
+ + abydos.distance.FuzzyWuzzyPartialString + """ + + cmp = FuzzyWuzzyPartialString() + + def test_fuzzywuzzy_partial_string_sim(self): + """Test abydos.distance.FuzzyWuzzyPartialString.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + # tests from blog + self.assertAlmostEqual( + self.cmp.sim('YANKEES', 'NEW YORK YANKEES'), 1.0 + ) + self.assertAlmostEqual( + self.cmp.sim('NEW YORK METS', 'NEW YORK YANKEES'), + 0.6923076923076923, + ) + self.assertAlmostEqual( + self.cmp.sim( + 'New York Mets vs Atlanta Braves', + 'Atlanta Braves vs New York Mets', + ), + 0.45161290322580644, + ) + + def test_fuzzywuzzy_partial_string_dist(self): + """Test abydos.distance.FuzzyWuzzyPartialString.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fuzzywuzzy_token_set.py b/tests/distance/test_distance_fuzzywuzzy_token_set.py new file mode 100644 index 000000000..6b7ff30b8 --- /dev/null +++ b/tests/distance/test_distance_fuzzywuzzy_token_set.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_fuzzywuzzy_token_set. + +This module contains unit tests for abydos.distance.FuzzyWuzzyTokenSet +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FuzzyWuzzyTokenSet +from abydos.tokenizer import QGrams + + +class FuzzyWuzzyTokenSetTestCases(unittest.TestCase): + """Test FuzzyWuzzyTokenSet functions. 
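FuzzyWuzzy's partial-string ratio scores the best same-length window of the longer string against the shorter one. The sketch below is not Abydos's implementation (it leans on difflib.SequenceMatcher, and the two may disagree on some inputs), but it conveys the idea and happens to reproduce the 'Nigel'/'Niall', 'Colin'/'Coiln', and 'YANKEES' expectations above:

    from difflib import SequenceMatcher

    def partial_ratio(s1, s2):
        """Best similarity of the shorter string against any window of the longer
        string (illustrative re-implementation, not Abydos's)."""
        shorter, longer = sorted((s1, s2), key=len)
        if not shorter:
            return 1.0  # mirror the base cases above, where '' matches anything
        best = 0.0
        for start in range(len(longer) - len(shorter) + 1):
            window = longer[start:start + len(shorter)]
            best = max(best, SequenceMatcher(None, shorter, window).ratio())
        return best

    print(partial_ratio('Nigel', 'Niall'))               # 0.6
    print(partial_ratio('Colin', 'Coiln'))               # 0.8
    print(partial_ratio('YANKEES', 'NEW YORK YANKEES'))  # 1.0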
+ + abydos.distance.FuzzyWuzzyTokenSet + """ + + cmp = FuzzyWuzzyTokenSet() + cmp_q2 = FuzzyWuzzyTokenSet(tokenizer=QGrams(qval=2)) + + def test_fuzzywuzzy_token_set_sim(self): + """Test abydos.distance.FuzzyWuzzyTokenSet.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.3333333333333333) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8333333333) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + # tests from blog + self.assertEqual( + self.cmp.sim( + 'mariners vs angels', + 'los angeles angels of anaheim at seattle mariners', + ), + 0.9411764705882353, + ) + self.assertEqual(self.cmp.sim('Sirhan, Sirhan', 'Sirhan'), 1.0) + + # q2 tokenizer + self.assertAlmostEqual( + self.cmp_q2.sim('ATCAACGAGT', 'AACGATTAG'), 0.84 + ) + self.assertAlmostEqual( + self.cmp_q2.sim('YANKEES', 'NEW YORK YANKEES'), 0.9545454545454546 + ) + self.assertAlmostEqual( + self.cmp_q2.sim('NEW YORK METS', 'NEW YORK YANKEES'), + 0.8450704225352113, + ) + self.assertAlmostEqual( + self.cmp_q2.sim( + 'New York Mets vs Atlanta Braves', + 'Atlanta Braves vs New York Mets', + ), + 0.9782608695652174, + ) + + def test_fuzzywuzzy_token_set_dist(self): + """Test abydos.distance.FuzzyWuzzyTokenSet.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6666666666666667) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1666666667) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1666666667) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_fuzzywuzzy_token_sort.py b/tests/distance/test_distance_fuzzywuzzy_token_sort.py new file mode 100644 index 000000000..ef503e1ce --- /dev/null +++ b/tests/distance/test_distance_fuzzywuzzy_token_sort.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
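The token-set ratio first splits both strings into word sets, then compares the sorted intersection against each side's intersection-plus-remainder and keeps the best ratio, which is why 'Sirhan, Sirhan' vs 'Sirhan' scores 1.0 above. A rough sketch of that construction (again via difflib rather than Abydos's own ratio, so the exact decimals of the q2-tokenizer cases are not expected to match):

    from difflib import SequenceMatcher

    def ratio(s1, s2):
        return SequenceMatcher(None, s1, s2).ratio()

    def token_set_ratio(s1, s2):
        """fuzzywuzzy-style token-set comparison (illustrative only)."""
        t1, t2 = set(s1.split()), set(s2.split())
        common = ' '.join(sorted(t1 & t2))
        combined1 = (common + ' ' + ' '.join(sorted(t1 - t2))).strip()
        combined2 = (common + ' ' + ' '.join(sorted(t2 - t1))).strip()
        return max(ratio(common, combined1),
                   ratio(common, combined2),
                   ratio(combined1, combined2))

    # 'Sirhan' is the whole intersection and also the whole second string
    print(token_set_ratio('Sirhan, Sirhan', 'Sirhan'))  # 1.0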
+ +"""abydos.tests.distance.test_distance_fuzzywuzzy_token_sort. + +This module contains unit tests for abydos.distance.FuzzyWuzzyTokenSort +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import FuzzyWuzzyTokenSort +from abydos.tokenizer import QGrams + + +class FuzzyWuzzyTokenSortTestCases(unittest.TestCase): + """Test FuzzyWuzzyTokenSort functions. + + abydos.distance.FuzzyWuzzyTokenSort + """ + + cmp = FuzzyWuzzyTokenSort() + cmp_q2 = FuzzyWuzzyTokenSort(tokenizer=QGrams(qval=2)) + + def test_fuzzywuzzy_token_sort_sim(self): + """Test abydos.distance.FuzzyWuzzyTokenSort.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6315789474 + ) + + # tests from blog + self.assertEqual( + self.cmp.sim( + 'New York Mets vs Atlanta Braves', + 'Atlanta Braves vs New York Mets', + ), + 1.0, + ) + + # q2 tokenizer + self.assertAlmostEqual( + self.cmp_q2.sim('ATCAACGAGT', 'AACGATTAG'), 0.8524590163934426 + ) + self.assertAlmostEqual( + self.cmp_q2.sim('YANKEES', 'NEW YORK YANKEES'), 0.6027397260273972 + ) + self.assertAlmostEqual( + self.cmp_q2.sim('NEW YORK METS', 'NEW YORK YANKEES'), + 0.7692307692307693, + ) + self.assertAlmostEqual( + self.cmp_q2.sim( + 'New York Mets vs Atlanta Braves', + 'Atlanta Braves vs New York Mets', + ), + 0.9578947368421052, + ) + + def test_fuzzywuzzy_token_sort_dist(self): + """Test abydos.distance.FuzzyWuzzyTokenSort.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3684210526 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_generalized_fleiss.py b/tests/distance/test_distance_generalized_fleiss.py new file mode 100644 index 000000000..e196b90db --- /dev/null +++ b/tests/distance/test_distance_generalized_fleiss.py @@ -0,0 +1,307 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_generalized_fleiss. + +This module contains unit tests for abydos.distance.GeneralizedFleiss +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GeneralizedFleiss + + +class GeneralizedFleissTestCases(unittest.TestCase): + """Test GeneralizedFleiss functions. + + abydos.distance.GeneralizedFleiss + """ + + cmp = GeneralizedFleiss(marginals='a') + cmp_no_d = GeneralizedFleiss(alphabet=0) + cmp_b = GeneralizedFleiss(marginals='b') + cmp_c = GeneralizedFleiss(marginals='c') + cmp_prop = GeneralizedFleiss(proportional=True) + cmp_quad = GeneralizedFleiss(mean_func='quadratic') + cmp_hero = GeneralizedFleiss(mean_func='heronian') + cmp_ag = GeneralizedFleiss(mean_func='ag') + cmp_gh = GeneralizedFleiss(mean_func='gh') + cmp_agh = GeneralizedFleiss(mean_func='agh') + + def test_generalized_fleiss_sim(self): + """Test abydos.distance.GeneralizedFleiss.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310964723 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3356164384 + ) + + # marginals b + self.assertEqual(self.cmp_b.sim('abc', 'abc'), 0.5051280702677116) + self.assertEqual(self.cmp_b.sim('abcd', 'efgh'), 0.4999588047443752) + + self.assertAlmostEqual( + self.cmp_b.sim('Nigel', 'Niall'), 0.5038260754642173 + ) + self.assertAlmostEqual( + self.cmp_b.sim('Niall', 'Nigel'), 0.5038260754642173 + ) + self.assertAlmostEqual( + self.cmp_b.sim('Colin', 'Coiln'), 0.5038260754642173 + ) + self.assertAlmostEqual( + self.cmp_b.sim('Coiln', 'Colin'), 0.5038260754642173 + ) + self.assertAlmostEqual( + self.cmp_b.sim('ATCAACGAGT', 'AACGATTAG'), 0.5089871192422611 + ) + + # marginals c + self.assertEqual(self.cmp_c.sim('abc', 'abc'), 
1.0) + self.assertEqual(self.cmp_c.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual( + self.cmp_c.sim('Nigel', 'Niall'), 0.7480719794344473 + ) + self.assertAlmostEqual( + self.cmp_c.sim('Niall', 'Nigel'), 0.7480719794344473 + ) + self.assertAlmostEqual( + self.cmp_c.sim('Colin', 'Coiln'), 0.7480719794344473 + ) + self.assertAlmostEqual( + self.cmp_c.sim('Coiln', 'Colin'), 0.7480719794344473 + ) + self.assertAlmostEqual( + self.cmp_c.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310760896330953 + ) + + # proportional + self.assertEqual(self.cmp_prop.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_prop.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual( + self.cmp_prop.sim('Nigel', 'Niall'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_prop.sim('Niall', 'Nigel'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_prop.sim('Colin', 'Coiln'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_prop.sim('Coiln', 'Colin'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_prop.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310964723 + ) + + # quadratic mean + self.assertEqual(self.cmp_quad.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_quad.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual( + self.cmp_quad.sim('Nigel', 'Niall'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_quad.sim('Niall', 'Nigel'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_quad.sim('Colin', 'Coiln'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_quad.sim('Coiln', 'Colin'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_quad.sim('ATCAACGAGT', 'AACGATTAG'), 0.8307317829209393 + ) + + # heronian mean + self.assertEqual(self.cmp_hero.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_hero.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual( + self.cmp_hero.sim('Nigel', 'Niall'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_hero.sim('Niall', 'Nigel'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_hero.sim('Colin', 'Coiln'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_hero.sim('Coiln', 'Colin'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_hero.sim('ATCAACGAGT', 'AACGATTAG'), 0.8312183486929234 + ) + + # ag mean + self.assertEqual(self.cmp_ag.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_ag.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp_ag.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp_ag.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp_ag.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp_ag.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp_ag.sim('ATCAACGAGT', 'AACGATTAG'), 0.8312793457877081 + ) + + # gh mean + self.assertEqual(self.cmp_gh.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_gh.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp_gh.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp_gh.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp_gh.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp_gh.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp_gh.sim('ATCAACGAGT', 'AACGATTAG'), 0.8316454969290444 + ) + + # agh mean + self.assertEqual(self.cmp_agh.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_agh.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual( + self.cmp_agh.sim('Nigel', 'Niall'), 0.7480719794 + ) + self.assertAlmostEqual( 
+ self.cmp_agh.sim('Niall', 'Nigel'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_agh.sim('Colin', 'Coiln'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_agh.sim('Coiln', 'Colin'), 0.7480719794 + ) + self.assertAlmostEqual( + self.cmp_agh.sim('ATCAACGAGT', 'AACGATTAG'), 0.8314623707995847 + ) + + def test_generalized_fleiss_dist(self): + """Test abydos.distance.GeneralizedFleiss.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1689035277 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6643835616 + ) + + def test_generalized_fleiss_corr(self): + """Test abydos.distance.GeneralizedFleiss.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6621929447 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + 
self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3287671233 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gilbert.py b/tests/distance/test_distance_gilbert.py new file mode 100644 index 000000000..d08a3ed3b --- /dev/null +++ b/tests/distance/test_distance_gilbert.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gilbert. + +This module contains unit tests for abydos.distance.Gilbert +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Gilbert + + +class GilbertTestCases(unittest.TestCase): + """Test Gilbert functions. + + abydos.distance.Gilbert + """ + + cmp = Gilbert() + cmp_no_d = Gilbert(alphabet=0) + + def test_gilbert_sim(self): + """Test abydos.distance.Gilbert.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4984005118362124) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.664957265) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.664957265) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.664957265) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.664957265) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7474691699 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 0.33333333333333337 + ) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4302325581 + ) + + def test_gilbert_dist(self): + """Test abydos.distance.Gilbert.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 
'efgh'), 0.5015994881637875) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.335042735) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.335042735) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.335042735) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.335042735) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2525308301 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.6666666666666666 + ) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5697674419 + ) + + def test_gilbert_corr(self): + """Test abydos.distance.Gilbert.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.003198976327575176) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.3299145299) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.3299145299) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.3299145299) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.3299145299) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.4949383398 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.3333333333333333 + ) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.2) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.1395348837 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gilbert_wells.py b/tests/distance/test_distance_gilbert_wells.py new file mode 100644 index 000000000..6d8121278 --- /dev/null +++ b/tests/distance/test_distance_gilbert_wells.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gilbert_wells. + +This module contains unit tests for abydos.distance.GilbertWells +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GilbertWells + + +class GilbertWellsTestCases(unittest.TestCase): + """Test GilbertWells functions. + + abydos.distance.GilbertWells + """ + + cmp = GilbertWells() + cmp_no_d = GilbertWells(alphabet=0) + + def test_gilbert_wells_sim(self): + """Test abydos.distance.GilbertWells.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.028716013247135602) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3776594411) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3776594411) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3776594411) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3776594411) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4950086952 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 0.13486136169765683 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.0255856715 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.0255856715 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.0255856715 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.0255856715 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0153237873 + ) + + def test_gilbert_wells_dist(self): + """Test abydos.distance.GilbertWells.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9712839867528644) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6223405589) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6223405589) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6223405589) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6223405589) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5049913048 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + 
self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.8651386383023432 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.9744143285 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.9744143285 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.9744143285 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.9744143285 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.9846762127 + ) + + def test_gilbert_wells_sim_score(self): + """Test abydos.distance.GilbertWells.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 76.91383873217538) + self.assertEqual(self.cmp.sim_score('a', ''), 40.179592442305186) + self.assertEqual(self.cmp.sim_score('', 'a'), 40.179592442305186) + self.assertEqual(self.cmp.sim_score('abc', ''), 39.4890060826051) + self.assertEqual(self.cmp.sim_score('', 'abc'), 39.4890060826051) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 49.00800898579118) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), 1.6845961909440712 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 25.6938443303 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 25.6938443303 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 25.6938443303 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 25.6938443303 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 55.2085412384 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), -36.04365338911715) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 70.9425768923849) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 70.9425768923849) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 71.63572407294485) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 71.63572407294485) + self.assertEqual( + self.cmp_no_d.sim_score('abc', 'abc'), 71.63572407294485 + ) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), 9.690984737859244 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 1.8432222004 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 1.8432222004 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 1.8432222004 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 1.8432222004 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.1132321566 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gini_i.py b/tests/distance/test_distance_gini_i.py new file mode 100644 index 000000000..2c073d92a --- /dev/null +++ b/tests/distance/test_distance_gini_i.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gini_i. + +This module contains unit tests for abydos.distance.GiniI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GiniI + + +class GiniITestCases(unittest.TestCase): + """Test GiniI functions. + + abydos.distance.GiniI + """ + + cmp = GiniI() + cmp_no_d = GiniI(alphabet=0) + + def test_gini_i_sim(self): + """Test abydos.distance.GiniI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8742017879948869) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4967907573812552) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7479180013) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7970761293 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.6666666666666666) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 2.220446049250313e-16 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.539317703 + ) + + def test_gini_i_dist(self): + """Test abydos.distance.GiniI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.1257982120051131) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5032092426187448) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2520819987) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2029238707 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + 
self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.33333333333333337) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.9999999999999998 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.460682297 + ) + + def test_gini_i_corr(self): + """Test abydos.distance.GiniI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.7484035759897738) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237489576) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4958360026) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.5941522586 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.33333333333333326) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.9999999999999996 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.078635406 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gini_ii.py b/tests/distance/test_distance_gini_ii.py new file mode 100644 index 000000000..34565dd14 --- /dev/null +++ b/tests/distance/test_distance_gini_ii.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gini_ii. 
+ +This module contains unit tests for abydos.distance.GiniII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GiniII + + +class GiniIITestCases(unittest.TestCase): + """Test GiniII functions. + + abydos.distance.GiniII + """ + + cmp = GiniII() + cmp_no_d = GiniII(alphabet=0) + + def test_gini_ii_sim(self): + """Test abydos.distance.GiniII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8742017879948869) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4967907573812552) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7479180013) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7479180013) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.805819926 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.6666666666666666) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 2.220446049250313e-16 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4545454545 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5419463087 + ) + + def test_gini_ii_dist(self): + """Test abydos.distance.GiniII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.1257982120051131) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5032092426187448) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2520819987) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2520819987) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.194180074 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.33333333333333337) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.9999999999999998 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 
'Nigel'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5454545455 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.4580536913 + ) + + def test_gini_ii_corr(self): + """Test abydos.distance.GiniII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.7484035759897738) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237489576) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4958360026) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4958360026) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.611639852 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.33333333333333326) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.9999999999999996 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.0909090909 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0838926174 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_goodall.py b/tests/distance/test_distance_goodall.py new file mode 100644 index 000000000..45a2b7242 --- /dev/null +++ b/tests/distance/test_distance_goodall.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_goodall. + +This module contains unit tests for abydos.distance.Goodall +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Goodall + + +class GoodallTestCases(unittest.TestCase): + """Test Goodall functions. 
+ + abydos.distance.Goodall + """ + + cmp = Goodall() + cmp_no_d = Goodall(alphabet=0) + + def test_goodall_sim(self): + """Test abydos.distance.Goodall.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9678321591500222) + self.assertEqual(self.cmp.sim('', 'a'), 0.9678321591500222) + self.assertEqual(self.cmp.sim('abc', ''), 0.9544884026871964) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9544884026871964) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9279473952929225) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9442360891) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9442360891) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9442360891) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9442360891) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.939755208 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.391826552 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.391826552 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.391826552 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.391826552 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_goodall_dist(self): + """Test abydos.distance.Goodall.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.03216784084997781) + self.assertEqual(self.cmp.dist('', 'a'), 0.03216784084997781) + self.assertEqual(self.cmp.dist('abc', ''), 0.04551159731280363) + self.assertEqual(self.cmp.dist('', 'abc'), 0.04551159731280363) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.07205260470707753) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0557639109) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0557639109) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0557639109) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0557639109) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.060244792 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.608173448 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.608173448 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.608173448 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.608173448 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == 
'__main__': + unittest.main() diff --git a/tests/distance/test_distance_goodman_kruskal_lambda.py b/tests/distance/test_distance_goodman_kruskal_lambda.py new file mode 100644 index 000000000..f4e834115 --- /dev/null +++ b/tests/distance/test_distance_goodman_kruskal_lambda.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_goodman_kruskal_lambda. + +This module contains unit tests for abydos.distance.GoodmanKruskalLambda +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GoodmanKruskalLambda + + +class GoodmanKruskalLambdaTestCases(unittest.TestCase): + """Test GoodmanKruskalLambda functions. + + abydos.distance.GoodmanKruskalLambda + """ + + cmp = GoodmanKruskalLambda() + cmp_no_d = GoodmanKruskalLambda(alphabet=0) + + def test_goodman_kruskal_lambda_sim(self): + """Test abydos.distance.GoodmanKruskalLambda.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_goodman_kruskal_lambda_dist(self): + """Test abydos.distance.GoodmanKruskalLambda.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + 
self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_goodman_kruskal_lambda_r.py b/tests/distance/test_distance_goodman_kruskal_lambda_r.py new file mode 100644 index 000000000..7ece11f30 --- /dev/null +++ b/tests/distance/test_distance_goodman_kruskal_lambda_r.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_goodman_kruskal_lambda_r. + +This module contains unit tests for abydos.distance.GoodmanKruskalLambdaR +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GoodmanKruskalLambdaR + + +class GoodmanKruskalLambdaRTestCases(unittest.TestCase): + """Test GoodmanKruskalLambdaR functions. 
+ + abydos.distance.GoodmanKruskalLambdaR + """ + + cmp = GoodmanKruskalLambdaR() + cmp_no_d = GoodmanKruskalLambdaR(alphabet=0) + + def test_goodman_kruskal_lambda_r_sim(self): + """Test abydos.distance.GoodmanKruskalLambdaR.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_goodman_kruskal_lambda_r_dist(self): + """Test abydos.distance.GoodmanKruskalLambdaR.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_goodman_kruskal_lambda_r_corr(self): + """Test abydos.distance.GoodmanKruskalLambdaR.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + 
self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_goodman_kruskal_tau_a.py b/tests/distance/test_distance_goodman_kruskal_tau_a.py new file mode 100644 index 000000000..8a7bbedce --- /dev/null +++ b/tests/distance/test_distance_goodman_kruskal_tau_a.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_goodman_kruskal_tau_a. + +This module contains unit tests for abydos.distance.GoodmanKruskalTauA +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GoodmanKruskalTauA +from abydos.tokenizer import QGrams + + +class GoodmanKruskalTauATestCases(unittest.TestCase): + """Test GoodmanKruskalTauA functions. 
+ + abydos.distance.GoodmanKruskalTauA + """ + + cmp = GoodmanKruskalTauA() + cmp_no_d = GoodmanKruskalTauA(alphabet=0) + + def test_goodman_kruskal_tau_a_sim(self): + """Test abydos.distance.GoodmanKruskalTauA.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.748403575989782) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 4.119695274745721e-05) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3290773882) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4593665172 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2437299035 + ) + + self.assertEqual( + GoodmanKruskalTauA( + intersection_type='linkage', + alphabet=64, + tokenizer=QGrams(qval=range(2, 4), skip=1), + ).sim('adhering', 'gilled'), + 0.0, + ) + self.assertEqual( + GoodmanKruskalTauA( + intersection_type='linkage', + alphabet=64, + tokenizer=QGrams(qval=range(2, 4), skip=1), + ).sim('gilled', 'adhering'), + 0.0, + ) + + def test_goodman_kruskal_tau_a_dist(self): + """Test abydos.distance.GoodmanKruskalTauA.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.251596424010218) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9999588030472526) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6709226118) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5406334828 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + 
self.cmp_no_d.dist('Nigel', 'Niall'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7562700965 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_goodman_kruskal_tau_b.py b/tests/distance/test_distance_goodman_kruskal_tau_b.py new file mode 100644 index 000000000..5947c0f33 --- /dev/null +++ b/tests/distance/test_distance_goodman_kruskal_tau_b.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_goodman_kruskal_tau_b. + +This module contains unit tests for abydos.distance.GoodmanKruskalTauB +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GoodmanKruskalTauB +from abydos.tokenizer import QGrams + + +class GoodmanKruskalTauBTestCases(unittest.TestCase): + """Test GoodmanKruskalTauB functions. 
+ + abydos.distance.GoodmanKruskalTauB + """ + + cmp = GoodmanKruskalTauB() + cmp_no_d = GoodmanKruskalTauB(alphabet=0) + + def test_goodman_kruskal_tau_b_sim(self): + """Test abydos.distance.GoodmanKruskalTauB.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.748403575989782) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 4.119695274745721e-05) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3290773882) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3290773882) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4608002285 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2727272727 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2187412587 + ) + + self.assertEqual( + GoodmanKruskalTauB( + intersection_type='linkage', + alphabet=64, + tokenizer=QGrams(qval=range(2, 4), skip=1), + ).sim('adhering', 'gilled'), + 0.0, + ) + self.assertEqual( + GoodmanKruskalTauB( + intersection_type='linkage', + alphabet=64, + tokenizer=QGrams(qval=range(2, 4), skip=1), + ).sim('gilled', 'adhering'), + 0.0, + ) + + def test_goodman_kruskal_tau_b_dist(self): + """Test abydos.distance.GoodmanKruskalTauB.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.251596424010218) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9999588030472526) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6709226118) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6709226118) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5391997715 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + 
self.cmp_no_d.dist('Nigel', 'Niall'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7272727273 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7812587413 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gotoh.py b/tests/distance/test_distance_gotoh.py index 244c3525e..85d27ea6d 100644 --- a/tests/distance/test_distance_gotoh.py +++ b/tests/distance/test_distance_gotoh.py @@ -44,49 +44,49 @@ class GotohTestCases(unittest.TestCase): abydos.distance.Gotoh """ - cmp = Gotoh() - nw = NeedlemanWunsch() - def test_gotoh_dist_abs(self): """Test abydos.distance.Gotoh.dist_abs.""" - self.assertEqual(gotoh('', ''), 0) + self.assertEqual(Gotoh().dist_abs('', ''), 0) # https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm self.assertEqual( - self.cmp.dist_abs('GATTACA', 'GCATGCU', 1, 1, _sim_nw), 0 + Gotoh(1, 1, _sim_nw).dist_abs('GATTACA', 'GCATGCU'), 0 ) self.assertGreaterEqual( - self.cmp.dist_abs('GATTACA', 'GCATGCU', 1, 0.5, _sim_nw), - self.nw.dist_abs('GATTACA', 'GCATGCU', 1, _sim_nw), + Gotoh(1, 0.5, _sim_nw).dist_abs('GATTACA', 'GCATGCU'), + NeedlemanWunsch(1, _sim_nw).dist_abs('GATTACA', 'GCATGCU'), ) self.assertEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, 5, _sim_wikipedia), - 16, + Gotoh(5, 5, _sim_wikipedia).dist_abs('AGACTAGTTAC', 'CGAGACGT'), 16 ) self.assertGreaterEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, 2, _sim_wikipedia), - self.nw.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, _sim_wikipedia), + Gotoh(5, 2, _sim_wikipedia).dist_abs('AGACTAGTTAC', 'CGAGACGT'), + NeedlemanWunsch(5, _sim_wikipedia).dist_abs( + 'AGACTAGTTAC', 'CGAGACGT' + ), ) # checked against http://ds9a.nl/nwunsch/ (mismatch=1, gap=5, skew=5) self.assertEqual( - self.cmp.dist_abs('CGATATCAG', 'TGACGSTGC', 5, 5, _sim_nw), -5 + Gotoh(5, 5, _sim_nw).dist_abs('CGATATCAG', 'TGACGSTGC'), -5 ) self.assertGreaterEqual( - self.cmp.dist_abs('CGATATCAG', 'TGACGSTGC', 5, 2, _sim_nw), - self.nw.dist_abs('CGATATCAG', 'TGACGSTGC', 5, _sim_nw), + Gotoh(5, 2, _sim_nw).dist_abs('CGATATCAG', 'TGACGSTGC'), + NeedlemanWunsch(5, _sim_nw).dist_abs('CGATATCAG', 'TGACGSTGC'), + ) + self.assertEqual( + Gotoh(5, 5, _sim_nw).dist_abs('AGACTAGTTAC', 'TGACGSTGC'), -7 ) - self.assertEqual(gotoh('AGACTAGTTAC', 'TGACGSTGC', 5, 5, _sim_nw), -7) self.assertGreaterEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'TGACGSTGC', 5, 2, _sim_nw), - self.nw.dist_abs('AGACTAGTTAC', 'TGACGSTGC', 5, _sim_nw), + Gotoh(5, 2, _sim_nw).dist_abs('AGACTAGTTAC', 'TGACGSTGC'), + NeedlemanWunsch(5, _sim_nw).dist_abs('AGACTAGTTAC', 'TGACGSTGC'), ) self.assertEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, 5, _sim_nw), -15 + Gotoh(5, 5, _sim_nw).dist_abs('AGACTAGTTAC', 'CGAGACGT'), -15 ) self.assertGreaterEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, 2, _sim_nw), - self.nw.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, _sim_nw), + Gotoh(5, 2, _sim_nw).dist_abs('AGACTAGTTAC', 'CGAGACGT'), + NeedlemanWunsch(5, _sim_nw).dist_abs('AGACTAGTTAC', 'CGAGACGT'), ) # Test wrapper @@ -98,20 +98,18 @@ def test_gotoh_dist_abs_nialls(self): """Test abydos.distance.Gotoh.dist_abs (Nialls set).""" # checked against http://ds9a.nl/nwunsch/ (mismatch=1, gap=2, skew=2) nw_vals = (5, 0, -2, 3, 1, 1, -2, -2, -1, -3, -3, -5, -3, -7, -7, -19) + g22 = Gotoh(2, 2, _sim_nw) for i in 
range(len(NIALL)): - self.assertEqual( - self.cmp.dist_abs(NIALL[0], NIALL[i], 2, 2, _sim_nw), - nw_vals[i], - ) + self.assertEqual(g22.dist_abs(NIALL[0], NIALL[i]), nw_vals[i]) nw_vals2 = (5, 0, -2, 3, 1, 1, -2, -2, -1, -2, -3, -3, -2, -6, -6, -8) + g21 = Gotoh(2, 1, _sim_nw) + g205 = Gotoh(2, 0.5, _sim_nw) + nw2 = NeedlemanWunsch(2, _sim_nw) for i in range(len(NIALL)): - self.assertEqual( - self.cmp.dist_abs(NIALL[0], NIALL[i], 2, 1, _sim_nw), - nw_vals2[i], - ) + self.assertEqual(g21.dist_abs(NIALL[0], NIALL[i]), nw_vals2[i]) self.assertGreaterEqual( - self.cmp.dist_abs(NIALL[0], NIALL[i], 2, 0.5, _sim_nw), - self.nw.dist_abs(NIALL[0], NIALL[i], 2, _sim_nw), + g205.dist_abs(NIALL[0], NIALL[i]), + nw2.dist_abs(NIALL[0], NIALL[i]), ) diff --git a/tests/distance/test_distance_gower_legendre.py b/tests/distance/test_distance_gower_legendre.py new file mode 100644 index 000000000..52cfc7770 --- /dev/null +++ b/tests/distance/test_distance_gower_legendre.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gower_legendre. + +This module contains unit tests for abydos.distance.GowerLegendre +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GowerLegendre + + +class GowerLegendreTestCases(unittest.TestCase): + """Test GowerLegendre functions. 
+ + abydos.distance.GowerLegendre + """ + + cmp = GowerLegendre() + cmp_no_d = GowerLegendre(alphabet=0) + + def test_gower_legendre_sim(self): + """Test abydos.distance.GowerLegendre.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918263) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918263) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429667) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429667) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961587708) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9955156951 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_gower_legendre_dist(self): + """Test abydos.distance.GowerLegendre.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033292) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033292) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038412292) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0044843049 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/distance/test_distance_guttman_lambda_a.py b/tests/distance/test_distance_guttman_lambda_a.py new file mode 100644 index 000000000..f1203ca54 --- /dev/null +++ b/tests/distance/test_distance_guttman_lambda_a.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_guttman_lambda_a. + +This module contains unit tests for abydos.distance.GuttmanLambdaA +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GuttmanLambdaA + + +class GuttmanLambdaATestCases(unittest.TestCase): + """Test GuttmanLambdaA functions. + + abydos.distance.GuttmanLambdaA + """ + + cmp = GuttmanLambdaA() + cmp_no_d = GuttmanLambdaA(alphabet=0) + + def test_guttman_lambda_a_sim(self): + """Test abydos.distance.GuttmanLambdaA.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_guttman_lambda_a_dist(self): + """Test abydos.distance.GuttmanLambdaA.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 1.0) + 
self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_guttman_lambda_b.py b/tests/distance/test_distance_guttman_lambda_b.py new file mode 100644 index 000000000..0a1da26bd --- /dev/null +++ b/tests/distance/test_distance_guttman_lambda_b.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_guttman_lambda_b. + +This module contains unit tests for abydos.distance.GuttmanLambdaB +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GuttmanLambdaB + + +class GuttmanLambdaBTestCases(unittest.TestCase): + """Test GuttmanLambdaB functions. 
+ + abydos.distance.GuttmanLambdaB + """ + + cmp = GuttmanLambdaB() + cmp_no_d = GuttmanLambdaB(alphabet=0) + + def test_guttman_lambda_b_sim(self): + """Test abydos.distance.GuttmanLambdaB.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_guttman_lambda_b_dist(self): + """Test abydos.distance.GuttmanLambdaB.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.7) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_gwet_ac.py b/tests/distance/test_distance_gwet_ac.py new file mode 100644 index 000000000..4967af7ad --- /dev/null +++ b/tests/distance/test_distance_gwet_ac.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. 
Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_gwet_ac. + +This module contains unit tests for abydos.distance.GwetAC +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import GwetAC + + +class GwetACTestCases(unittest.TestCase): + """Test GwetAC functions. + + abydos.distance.GwetAC + """ + + cmp = GwetAC() + cmp_no_d = GwetAC(alphabet=0) + + def test_gwet_ac_sim(self): + """Test abydos.distance.GwetAC.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987212317930483) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987212317930483) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974359309794483) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974359309794483) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9935405839180314) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961144519) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961144519) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961144519) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961144519) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9954145343 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.6 + ) + + def test_gwet_ac_dist(self): + """Test abydos.distance.GwetAC.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012787682069517192) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012787682069517192) + self.assertEqual(self.cmp.dist('abc', ''), 0.002564069020551729) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002564069020551729) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.00645941608196865) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038855481) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038855481) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038855481) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038855481) + 
self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0045854657 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.4 + ) + + def test_gwet_ac_corr(self): + """Test abydos.distance.GwetAC.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.9974424635860967) + self.assertEqual(self.cmp.corr('', 'a'), 0.9974424635860967) + self.assertEqual(self.cmp.corr('abc', ''), 0.9948718619588964) + self.assertEqual(self.cmp.corr('', 'abc'), 0.9948718619588964) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.9870811678360627) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9922289037) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9922289037) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9922289037) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9922289037) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9908290686 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.2) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.2) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.2 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hamann.py b/tests/distance/test_distance_hamann.py new file mode 100644 index 000000000..d72785a59 --- /dev/null +++ b/tests/distance/test_distance_hamann.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
+ +"""abydos.tests.distance.test_distance_hamann. + +This module contains unit tests for abydos.distance.Hamann +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Hamann + + +class HamannTestCases(unittest.TestCase): + """Test Hamann functions. + + abydos.distance.Hamann + """ + + cmp = Hamann() + cmp_no_d = Hamann(alphabet=0) + + def test_hamann_sim(self): + """Test abydos.distance.Hamann.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9910714286 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_hamann_dist(self): + """Test abydos.distance.Hamann.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('', 'a'), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('abc', ''), 0.005102040816326481) + self.assertEqual(self.cmp.dist('', 'abc'), 0.005102040816326481) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816313) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 
0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_hamann_corr(self): + """Test abydos.distance.Hamann.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.9948979591836735) + self.assertEqual(self.cmp.corr('', 'a'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abc', ''), 0.9897959183673469) + self.assertEqual(self.cmp.corr('', 'abc'), 0.9897959183673469) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.9744897959183674) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9846938776) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9846938776) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9821428571 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hamming.py b/tests/distance/test_distance_hamming.py index 828b5af7c..42ad19202 100644 --- a/tests/distance/test_distance_hamming.py +++ b/tests/distance/test_distance_hamming.py @@ -40,24 +40,25 @@ class HammingTestCases(unittest.TestCase): """ cmp = Hamming() + cmp_no_diff = Hamming(False) def test_hamming_dist_abs(self): """Test abydos.distance.Hamming.dist_abs.""" self.assertEqual(self.cmp.dist_abs('', ''), 0) - self.assertEqual(self.cmp.dist_abs('', '', False), 0) + self.assertEqual(self.cmp_no_diff.dist_abs('', ''), 0) self.assertEqual(self.cmp.dist_abs('a', ''), 1) self.assertEqual(self.cmp.dist_abs('a', 'a'), 0) - self.assertEqual(self.cmp.dist_abs('a', 'a', False), 0) + self.assertEqual(self.cmp_no_diff.dist_abs('a', 'a'), 0) self.assertEqual(self.cmp.dist_abs('a', 'b'), 1) - self.assertEqual(self.cmp.dist_abs('a', 'b', False), 1) + self.assertEqual(self.cmp_no_diff.dist_abs('a', 'b'), 1) self.assertEqual(self.cmp.dist_abs('abc', 'cba'), 2) - self.assertEqual(self.cmp.dist_abs('abc', 'cba', False), 2) + self.assertEqual(self.cmp_no_diff.dist_abs('abc', 'cba'), 2) self.assertEqual(self.cmp.dist_abs('abc', ''), 3) self.assertEqual(self.cmp.dist_abs('bb', 'cbab'), 3) # test exception - self.assertRaises(ValueError, self.cmp.dist_abs, 'ab', 'a', False) + self.assertRaises(ValueError, self.cmp_no_diff.dist_abs, 'ab', 'a') # 
https://en.wikipedia.org/wiki/Hamming_distance self.assertEqual(self.cmp.dist_abs('karolin', 'kathrin'), 3) @@ -66,25 +67,25 @@ def test_hamming_dist_abs(self): self.assertEqual(self.cmp.dist_abs('2173896', '2233796'), 3) # Test wrapper - self.assertEqual(hamming('abc', 'cba', False), 2) + self.assertEqual(hamming('abc', 'cba'), 2) def test_hamming_dist(self): """Test abydos.distance.Hamming.dist.""" self.assertEqual(self.cmp.dist('', ''), 0) - self.assertEqual(self.cmp.dist('', '', False), 0) + self.assertEqual(self.cmp_no_diff.dist('', ''), 0) self.assertEqual(self.cmp.dist('a', ''), 1) self.assertEqual(self.cmp.dist('a', 'a'), 0) - self.assertEqual(self.cmp.dist('a', 'a', False), 0) + self.assertEqual(self.cmp_no_diff.dist('a', 'a'), 0) self.assertEqual(self.cmp.dist('a', 'b'), 1) - self.assertEqual(self.cmp.dist('a', 'b', False), 1) + self.assertEqual(self.cmp_no_diff.dist('a', 'b'), 1) self.assertAlmostEqual(self.cmp.dist('abc', 'cba'), 2 / 3) - self.assertAlmostEqual(self.cmp.dist('abc', 'cba', False), 2 / 3) + self.assertAlmostEqual(self.cmp_no_diff.dist('abc', 'cba'), 2 / 3) self.assertEqual(self.cmp.dist('abc', ''), 1) self.assertAlmostEqual(self.cmp.dist('bb', 'cbab'), 3 / 4) # test exception - self.assertRaises(ValueError, self.cmp.dist, 'ab', 'a', False) + self.assertRaises(ValueError, self.cmp_no_diff.dist, 'ab', 'a') # https://en.wikipedia.org/wiki/Hamming_distance self.assertAlmostEqual(self.cmp.dist('karolin', 'kathrin'), 3 / 7) @@ -98,20 +99,20 @@ def test_hamming_dist(self): def test_hamming_sim(self): """Test abydos.distance.Hamming.sim.""" self.assertEqual(self.cmp.sim('', ''), 1) - self.assertEqual(self.cmp.sim('', '', False), 1) + self.assertEqual(self.cmp_no_diff.sim('', ''), 1) self.assertEqual(self.cmp.sim('a', ''), 0) self.assertEqual(self.cmp.sim('a', 'a'), 1) - self.assertEqual(self.cmp.sim('a', 'a', False), 1) + self.assertEqual(self.cmp_no_diff.sim('a', 'a'), 1) self.assertEqual(self.cmp.sim('a', 'b'), 0) - self.assertEqual(self.cmp.sim('a', 'b', False), 0) + self.assertEqual(self.cmp_no_diff.sim('a', 'b'), 0) self.assertAlmostEqual(self.cmp.sim('abc', 'cba'), 1 / 3) - self.assertAlmostEqual(self.cmp.sim('abc', 'cba', False), 1 / 3) + self.assertAlmostEqual(self.cmp_no_diff.sim('abc', 'cba'), 1 / 3) self.assertEqual(self.cmp.sim('abc', ''), 0) self.assertAlmostEqual(self.cmp.sim('bb', 'cbab'), 1 / 4) # test exception - self.assertRaises(ValueError, self.cmp.sim, 'ab', 'a', False) + self.assertRaises(ValueError, self.cmp_no_diff.sim, 'ab', 'a') # https://en.wikipedia.org/wiki/Hamming_distance self.assertAlmostEqual(self.cmp.sim('karolin', 'kathrin'), 4 / 7) diff --git a/tests/distance/test_distance_harris_lahey.py b/tests/distance/test_distance_harris_lahey.py new file mode 100644 index 000000000..6391b87ff --- /dev/null +++ b/tests/distance/test_distance_harris_lahey.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_harris_lahey. + +This module contains unit tests for abydos.distance.HarrisLahey +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import HarrisLahey + + +class HarrisLaheyTestCases(unittest.TestCase): + """Test HarrisLahey functions. + + abydos.distance.HarrisLahey + """ + + cmp = HarrisLahey() + cmp_no_d = HarrisLahey(alphabet=0) + + def test_harris_lahey_sim(self): + """Test abydos.distance.HarrisLahey.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0012722563515202) + self.assertEqual(self.cmp.sim('', 'a'), 0.0012722563515202) + self.assertEqual(self.cmp.sim('abc', ''), 0.0025380049979175346) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0025380049979175346) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.006296204706372345) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3383765798) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3383765798) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3383765798) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3383765798) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5065757722 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1111111111 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.125 + ) + + def test_harris_lahey_dist(self): + """Test abydos.distance.HarrisLahey.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.9987277436484798) + self.assertEqual(self.cmp.dist('', 'a'), 0.9987277436484798) + self.assertEqual(self.cmp.dist('abc', ''), 0.9974619950020824) + self.assertEqual(self.cmp.dist('', 'abc'), 0.9974619950020824) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9937037952936276) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6616234202) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6616234202) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6616234202) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6616234202) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4934242278 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + 
self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8888888889 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8888888889 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8888888889 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8888888889 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.875 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hassanat.py b/tests/distance/test_distance_hassanat.py new file mode 100644 index 000000000..c183ab8e5 --- /dev/null +++ b/tests/distance/test_distance_hassanat.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_hassanat. + +This module contains unit tests for abydos.distance.Hassanat +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from collections import Counter + +from abydos.distance import Hassanat + + +class HassanatTestCases(unittest.TestCase): + """Test Hassanat functions. 
+ + abydos.distance.Hassanat + """ + + cmp = Hassanat() + + def test_hassanat_dist(self): + """Test abydos.distance.Hassanat.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.25) + + def test_hassanat_sim(self): + """Test abydos.distance.Hassanat.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.75) + + def test_hassanat_dist_abs(self): + """Test abydos.distance.Hassanat.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 5.0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 3.0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 3.0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 3.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 3.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 3.5 + ) + + self.assertAlmostEqual( + self.cmp.dist_abs( + Counter({'a': -4, 'b': -2}), Counter({'a': -2, 'b': 4}) + ), + 0.8571428571428572, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hawkins_dotson.py b/tests/distance/test_distance_hawkins_dotson.py new file mode 100644 index 000000000..26460500d --- /dev/null +++ b/tests/distance/test_distance_hawkins_dotson.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_hawkins_dotson. + +This module contains unit tests for abydos.distance.HawkinsDotson +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import HawkinsDotson + + +class HawkinsDotsonTestCases(unittest.TestCase): + """Test HawkinsDotson functions. + + abydos.distance.HawkinsDotson + """ + + cmp = HawkinsDotson() + cmp_no_d = HawkinsDotson(alphabet=0) + + def test_hawkins_dotson_sim(self): + """Test abydos.distance.HawkinsDotson.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.4987244897959184) + self.assertEqual(self.cmp.sim('', 'a'), 0.4987244897959184) + self.assertEqual(self.cmp.sim('abc', ''), 0.49744897959183676) + self.assertEqual(self.cmp.sim('', 'abc'), 0.49744897959183676) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49362244897959184) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6628254375) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6628254375) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6628254375) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6628254375) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7454954955 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.25 + ) + + def test_hawkins_dotson_dist(self): + """Test abydos.distance.HawkinsDotson.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5012755102040816) + self.assertEqual(self.cmp.dist('', 'a'), 0.5012755102040816) + self.assertEqual(self.cmp.dist('abc', ''), 0.5025510204081632) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5025510204081632) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5063775510204082) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3371745625) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3371745625) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3371745625) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3371745625) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2545045045 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + 
self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.75 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hellinger.py b/tests/distance/test_distance_hellinger.py new file mode 100644 index 000000000..5a1e6edb4 --- /dev/null +++ b/tests/distance/test_distance_hellinger.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_hellinger. + +This module contains unit tests for abydos.distance.Hellinger +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Hellinger + + +class HellingerTestCases(unittest.TestCase): + """Test Hellinger functions. 
+ + abydos.distance.Hellinger + """ + + cmp = Hellinger() + + def test_hellinger_dist(self): + """Test abydos.distance.Hellinger.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8164965809) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8164965809) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8164965809) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8164965809) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.7071067812 + ) + + def test_hellinger_sim(self): + """Test abydos.distance.Hellinger.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1835034191) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1835034191) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1835034191) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1835034191) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.2928932188 + ) + + def test_hellinger_dist_abs(self): + """Test abydos.distance.Hellinger.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 2.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 2.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2.8284271247461903) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2.8284271247461903) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4.47213595499958) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 3.4641016151 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 3.4641016151 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 3.4641016151 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 3.4641016151 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 3.7416573868 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_higuera_mico.py b/tests/distance/test_distance_higuera_mico.py new file mode 100644 index 000000000..fea29b557 --- /dev/null +++ b/tests/distance/test_distance_higuera_mico.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_higuera_mico. + +This module contains unit tests for abydos.distance.HigueraMico +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import HigueraMico + + +class HigueraMicoTestCases(unittest.TestCase): + """Test HigueraMico functions. + + abydos.distance.HigueraMico + """ + + cmp = HigueraMico() + + def test_higuera_mico_dist(self): + """Test abydos.distance.HigueraMico.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_higuera_mico_sim(self): + """Test abydos.distance.HigueraMico.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_higuera_mico_dist_abs(self): + """Test abydos.distance.HigueraMico.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 1.8333333333333333) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 1.8333333333333333) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_hurlbert.py b/tests/distance/test_distance_hurlbert.py new file mode 100644 index 000000000..52cb0229f --- /dev/null +++ b/tests/distance/test_distance_hurlbert.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_hurlbert. + +This module contains unit tests for abydos.distance.Hurlbert +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Hurlbert +from abydos.tokenizer import VCClusterTokenizer + + +class HurlbertTestCases(unittest.TestCase): + """Test Hurlbert functions. + + abydos.distance.Hurlbert + """ + + cmp = Hurlbert() + cmp_no_d = Hurlbert(alphabet=0) + + def test_hurlbert_sim(self): + """Test abydos.distance.Hurlbert.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.748049385) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.748049385) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.748049385) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.748049385) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.815793032 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_hurlbert_dist(self): + """Test abydos.distance.Hurlbert.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.251950615) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.251950615) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.251950615) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.251950615) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.184206968 + ) + + # Tests with alphabet=0 (no d factor) + 
self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_hurlbert_corr(self): + """Test abydos.distance.Hurlbert.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.49609877) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.49609877) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.49609877) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.49609877) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.631586064 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + self.assertEqual( + Hurlbert(alphabet=0, tokenizer=VCClusterTokenizer()).corr( + 'a', 'eh' + ), + 0.0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_iterative_substring.py b/tests/distance/test_distance_iterative_substring.py new file mode 100644 index 000000000..efa0b45e7 --- /dev/null +++ b/tests/distance/test_distance_iterative_substring.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_iterative_substring. 
+ +This module contains unit tests for abydos.distance.IterativeSubString +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import IterativeSubString + + +class IterativeSubStringTestCases(unittest.TestCase): + """Test IterativeSubString functions. + + abydos.distance.IterativeSubString + """ + + cmp = IterativeSubString() + cmp_norm = IterativeSubString(normalize_strings=True) + + def test_iterative_substring_sim(self): + """Test abydos.distance.IterativeSubString.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6618421053 + ) + + def test_iterative_substring_dist(self): + """Test abydos.distance.IterativeSubString.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3381578947 + ) + + def test_iterative_substring_corr(self): + """Test abydos.distance.IterativeSubString.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), -0.8) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), -0.8) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), -0.8) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), -0.8) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3236842105 + ) + self.assertAlmostEqual( + self.cmp_norm.corr('ATCAACGAGT', 'AACGATTAG'), 0.3236842105 + ) + self.assertAlmostEqual( + self.cmp_norm.corr('ATC..AACGAGT', 'AA_CGAT_TAG'), 0.3236842105 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_jaccard.py b/tests/distance/test_distance_jaccard.py index 56b883fe7..b7b33c7fc 100644 --- a/tests/distance/test_distance_jaccard.py +++ b/tests/distance/test_distance_jaccard.py @@ -32,7 +32,7 @@ import unittest from abydos.distance import Jaccard, dist_jaccard, sim_jaccard, tanimoto -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer 
from .. import NONQ_FROM, NONQ_TO @@ -44,6 +44,8 @@ class JaccardTestCases(unittest.TestCase): """ cmp = Jaccard() + cmp_q2 = Jaccard(tokenizer=QGrams(2)) + cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer()) def test_jaccard_sim(self): """Test abydos.distance.Jaccard.sim.""" @@ -52,25 +54,47 @@ def test_jaccard_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 11) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 4 / 11) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 11) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 4 / 11 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 4 / 11, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 / 3) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 / 3) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3) # Test wrapper self.assertAlmostEqual(sim_jaccard('nelson', 'neilsen'), 4 / 11) @@ -82,25 +106,47 @@ def test_jaccard_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 11) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 2), 7 / 11) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 11) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + 
QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 7 / 11 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 / 11, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 2 / 3) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 2 / 3) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 2 / 3) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 2 / 3) # Test wrapper self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7 / 11) @@ -113,6 +159,8 @@ class TanimotoTestCases(unittest.TestCase): """ cmp = Jaccard() + cmp_q2 = Jaccard(tokenizer=QGrams(2)) + cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer()) def test_jaccard_tanimoto_coeff(self): """Test abydos.distance.Jaccard.tanimoto_coeff.""" @@ -123,46 +171,61 @@ def test_jaccard_tanimoto_coeff(self): self.cmp.tanimoto_coeff('nelson', 'neilsen'), math.log(4 / 11, 2) ) - self.assertEqual(self.cmp.tanimoto_coeff('', '', 2), 0) + self.assertEqual(self.cmp_q2.tanimoto_coeff('', ''), 0) self.assertEqual( - self.cmp.tanimoto_coeff('nelson', '', 2), float('-inf') + self.cmp_q2.tanimoto_coeff('nelson', ''), float('-inf') ) self.assertEqual( - self.cmp.tanimoto_coeff('', 'neilsen', 2), float('-inf') + self.cmp_q2.tanimoto_coeff('', 'neilsen'), float('-inf') ) self.assertAlmostEqual( - self.cmp.tanimoto_coeff('nelson', 'neilsen', 2), + self.cmp_q2.tanimoto_coeff('nelson', 'neilsen'), math.log(4 / 11, 2), ) # supplied q-gram tests - self.assertEqual(self.cmp.tanimoto_coeff(QGrams(''), QGrams('')), 0) self.assertEqual( - self.cmp.tanimoto_coeff(QGrams('nelson'), QGrams('')), + self.cmp.tanimoto_coeff( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.tanimoto_coeff( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), float('-inf'), ) self.assertEqual( - self.cmp.tanimoto_coeff(QGrams(''), QGrams('neilsen')), + self.cmp.tanimoto_coeff( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), float('-inf'), ) self.assertAlmostEqual( - self.cmp.tanimoto_coeff(QGrams('nelson'), QGrams('neilsen')), + self.cmp.tanimoto_coeff( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), math.log(4 / 11, 2), ) # non-q-gram tests - self.assertEqual(self.cmp.tanimoto_coeff('', '', 0), 0) + self.assertEqual(self.cmp_ws.tanimoto_coeff('', ''), 0) self.assertEqual( - self.cmp.tanimoto_coeff('the quick', '', 0), float('-inf') + self.cmp_ws.tanimoto_coeff('the quick', ''), float('-inf') ) self.assertEqual( - self.cmp.tanimoto_coeff('', 'the quick', 0), float('-inf') + self.cmp_ws.tanimoto_coeff('', 'the quick'), float('-inf') ) self.assertAlmostEqual( - self.cmp.tanimoto_coeff(NONQ_FROM, NONQ_TO, 0), math.log(1 / 3, 2) + self.cmp_ws.tanimoto_coeff(NONQ_FROM, NONQ_TO), math.log(1 / 3, 2) ) self.assertAlmostEqual( - self.cmp.tanimoto_coeff(NONQ_TO, NONQ_FROM, 0), math.log(1 / 3, 2) + self.cmp_ws.tanimoto_coeff(NONQ_TO, 
NONQ_FROM), math.log(1 / 3, 2) ) # Test wrapper diff --git a/tests/distance/test_distance_jaccard_nm.py b/tests/distance/test_distance_jaccard_nm.py new file mode 100644 index 000000000..f0f9a0858 --- /dev/null +++ b/tests/distance/test_distance_jaccard_nm.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_jaccard_nm. + +This module contains unit tests for abydos.distance.JaccardNM +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import JaccardNM + + +class JaccardNMTestCases(unittest.TestCase): + """Test JaccardNM functions. + + abydos.distance.JaccardNM + """ + + cmp = JaccardNM() + cmp_no_d = JaccardNM(alphabet=0) + + def test_jaccard_nm_sim(self): + """Test abydos.distance.JaccardNM.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.01015228426395939) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0075662043) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0075662043) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0075662043) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0075662043) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0175438596 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_jaccard_nm_dist(self): + """Test abydos.distance.JaccardNM.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 
1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9924337957) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9924337957) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9924337957) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9924337957) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.9824561404 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_jaccard_nm_sim_score(self): + """Test abydos.distance.JaccardNM.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual( + self.cmp.sim_score('abc', 'abc'), 0.005076142131979695 + ) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.0037831021 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.0037831021 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.0037831021 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.0037831021 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.0087719298 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.25 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_jaro_winkler.py b/tests/distance/test_distance_jaro_winkler.py index 6930344a8..887da4185 100644 --- a/tests/distance/test_distance_jaro_winkler.py +++ b/tests/distance/test_distance_jaro_winkler.py @@ -39,72 +39,67 @@ class JaroWinklerTestCases(unittest.TestCase): abydos.distance.JaroWinkler """ - cmp = JaroWinkler() + jaro = JaroWinkler(mode='jaro') + jaro_winkler = 
JaroWinkler(mode='winkler') def test_sim_jaro_winkler(self): """Test abydos.distance.JaroWinkler.sim.""" - self.assertEqual(self.cmp.sim('', '', mode='jaro'), 1) - self.assertEqual(self.cmp.sim('', '', mode='winkler'), 1) - self.assertEqual(self.cmp.sim('MARTHA', '', mode='jaro'), 0) - self.assertEqual(self.cmp.sim('MARTHA', '', mode='winkler'), 0) - self.assertEqual(self.cmp.sim('', 'MARHTA', mode='jaro'), 0) - self.assertEqual(self.cmp.sim('', 'MARHTA', mode='winkler'), 0) - self.assertEqual(self.cmp.sim('MARTHA', 'MARTHA', mode='jaro'), 1) - self.assertEqual(self.cmp.sim('MARTHA', 'MARTHA', mode='winkler'), 1) + self.assertEqual(self.jaro.sim('', ''), 1) + self.assertEqual(self.jaro_winkler.sim('', ''), 1) + self.assertEqual(self.jaro.sim('MARTHA', ''), 0) + self.assertEqual(self.jaro_winkler.sim('MARTHA', ''), 0) + self.assertEqual(self.jaro.sim('', 'MARHTA'), 0) + self.assertEqual(self.jaro_winkler.sim('', 'MARHTA'), 0) + self.assertEqual(self.jaro.sim('MARTHA', 'MARTHA'), 1) + self.assertEqual(self.jaro_winkler.sim('MARTHA', 'MARTHA'), 1) # https://en.wikipedia.org/wiki/Jaro-Winkler_distance + self.assertAlmostEqual(self.jaro.sim('MARTHA', 'MARHTA'), 0.94444444) self.assertAlmostEqual( - self.cmp.sim('MARTHA', 'MARHTA', mode='jaro'), 0.94444444 + self.jaro_winkler.sim('MARTHA', 'MARHTA'), 0.96111111 ) + self.assertAlmostEqual(self.jaro.sim('DWAYNE', 'DUANE'), 0.82222222) + self.assertAlmostEqual(self.jaro_winkler.sim('DWAYNE', 'DUANE'), 0.84) + self.assertAlmostEqual(self.jaro.sim('DIXON', 'DICKSONX'), 0.76666666) self.assertAlmostEqual( - self.cmp.sim('MARTHA', 'MARHTA', mode='winkler'), 0.96111111 - ) - self.assertAlmostEqual( - self.cmp.sim('DWAYNE', 'DUANE', mode='jaro'), 0.82222222 - ) - self.assertAlmostEqual( - self.cmp.sim('DWAYNE', 'DUANE', mode='winkler'), 0.84 - ) - self.assertAlmostEqual( - self.cmp.sim('DIXON', 'DICKSONX', mode='jaro'), 0.76666666 - ) - self.assertAlmostEqual( - self.cmp.sim('DIXON', 'DICKSONX', mode='winkler'), 0.81333333 + self.jaro_winkler.sim('DIXON', 'DICKSONX'), 0.81333333 ) self.assertRaises( - ValueError, self.cmp.sim, 'abcd', 'dcba', boost_threshold=2 + ValueError, JaroWinkler(boost_threshold=2).sim, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.sim, 'abcd', 'dcba', boost_threshold=-1 + ValueError, JaroWinkler(boost_threshold=-1).sim, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.sim, 'abcd', 'dcba', scaling_factor=0.3 + ValueError, JaroWinkler(scaling_factor=0.3).sim, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.sim, 'abcd', 'dcba', scaling_factor=-1 + ValueError, JaroWinkler(scaling_factor=-1).sim, 'abcd', 'dcba' ) - self.assertAlmostEqual(self.cmp.sim('ABCD', 'EFGH'), 0.0) + self.assertAlmostEqual(self.jaro_winkler.sim('ABCD', 'EFGH'), 0.0) # long_strings = True (applies only to Jaro-Winkler, not Jaro) self.assertEqual( - self.cmp.sim('ABCD', 'EFGH', long_strings=True), - self.cmp.sim('ABCD', 'EFGH'), + JaroWinkler(long_strings=True).sim('ABCD', 'EFGH'), + self.jaro.sim('ABCD', 'EFGH'), ) self.assertEqual( - self.cmp.sim('DIXON', 'DICKSONX', mode='jaro', long_strings=True), - self.cmp.sim('DIXON', 'DICKSONX', mode='jaro'), + JaroWinkler(mode='jaro', long_strings=True).sim( + 'DIXON', 'DICKSONX' + ), + self.jaro.sim('DIXON', 'DICKSONX'), ) self.assertAlmostEqual( - self.cmp.sim( - 'DIXON', 'DICKSONX', mode='winkler', long_strings=True + JaroWinkler(mode='winkler', long_strings=True).sim( + 'DIXON', 'DICKSONX' ), 0.83030303, ) self.assertAlmostEqual( - self.cmp.sim( - 'MARTHA', 'MARHTA', mode='winkler', 
long_strings=True + JaroWinkler(mode='winkler', long_strings=True).sim( + 'MARTHA', 'MARHTA' ), 0.97083333, ) @@ -119,49 +114,41 @@ def test_sim_jaro_winkler(self): def test_dist_jaro_winkler(self): """Test abydos.distance.JaroWinkler.dist.""" - self.assertEqual(self.cmp.dist('', '', mode='jaro'), 0) - self.assertEqual(self.cmp.dist('', '', mode='winkler'), 0) - self.assertEqual(self.cmp.dist('MARTHA', '', mode='jaro'), 1) - self.assertEqual(self.cmp.dist('MARTHA', '', mode='winkler'), 1) - self.assertEqual(self.cmp.dist('', 'MARHTA', mode='jaro'), 1) - self.assertEqual(self.cmp.dist('', 'MARHTA', mode='winkler'), 1) - self.assertEqual(self.cmp.dist('MARTHA', 'MARTHA', mode='jaro'), 0) - self.assertEqual(self.cmp.dist('MARTHA', 'MARTHA', mode='winkler'), 0) + self.assertEqual(self.jaro.dist('', ''), 0) + self.assertEqual(self.jaro_winkler.dist('', ''), 0) + self.assertEqual(self.jaro.dist('MARTHA', ''), 1) + self.assertEqual(self.jaro_winkler.dist('MARTHA', ''), 1) + self.assertEqual(self.jaro.dist('', 'MARHTA'), 1) + self.assertEqual(self.jaro_winkler.dist('', 'MARHTA'), 1) + self.assertEqual(self.jaro.dist('MARTHA', 'MARTHA'), 0) + self.assertEqual(self.jaro_winkler.dist('MARTHA', 'MARTHA'), 0) # https://en.wikipedia.org/wiki/Jaro-Winkler_distance + self.assertAlmostEqual(self.jaro.dist('MARTHA', 'MARHTA'), 0.05555555) self.assertAlmostEqual( - self.cmp.dist('MARTHA', 'MARHTA', mode='jaro'), 0.05555555 - ) - self.assertAlmostEqual( - self.cmp.dist('MARTHA', 'MARHTA', mode='winkler'), 0.03888888 - ) - self.assertAlmostEqual( - self.cmp.dist('DWAYNE', 'DUANE', mode='jaro'), 0.17777777 - ) - self.assertAlmostEqual( - self.cmp.dist('DWAYNE', 'DUANE', mode='winkler'), 0.16 - ) - self.assertAlmostEqual( - self.cmp.dist('DIXON', 'DICKSONX', mode='jaro'), 0.23333333 + self.jaro_winkler.dist('MARTHA', 'MARHTA'), 0.03888888 ) + self.assertAlmostEqual(self.jaro.dist('DWAYNE', 'DUANE'), 0.17777777) + self.assertAlmostEqual(self.jaro_winkler.dist('DWAYNE', 'DUANE'), 0.16) + self.assertAlmostEqual(self.jaro.dist('DIXON', 'DICKSONX'), 0.23333333) self.assertAlmostEqual( - self.cmp.dist('DIXON', 'DICKSONX', mode='winkler'), 0.18666666 + self.jaro_winkler.dist('DIXON', 'DICKSONX'), 0.18666666 ) self.assertRaises( - ValueError, self.cmp.dist, 'abcd', 'dcba', boost_threshold=2 + ValueError, JaroWinkler(boost_threshold=2).dist, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.dist, 'abcd', 'dcba', boost_threshold=-1 + ValueError, JaroWinkler(boost_threshold=-1).dist, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.dist, 'abcd', 'dcba', scaling_factor=0.3 + ValueError, JaroWinkler(scaling_factor=0.3).dist, 'abcd', 'dcba' ) self.assertRaises( - ValueError, self.cmp.dist, 'abcd', 'dcba', scaling_factor=-1 + ValueError, JaroWinkler(scaling_factor=-1).dist, 'abcd', 'dcba' ) - self.assertAlmostEqual(self.cmp.dist('ABCD', 'EFGH'), 1.0) + self.assertAlmostEqual(self.jaro_winkler.dist('ABCD', 'EFGH'), 1.0) # Test wrapper self.assertAlmostEqual( diff --git a/tests/distance/test_distance_jensen_shannon.py b/tests/distance/test_distance_jensen_shannon.py new file mode 100644 index 000000000..54ff60163 --- /dev/null +++ b/tests/distance/test_distance_jensen_shannon.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_jensen_shannon. + +This module contains unit tests for abydos.distance.JensenShannon +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import JensenShannon + + +class JensenShannonTestCases(unittest.TestCase): + """Test JensenShannon functions. + + abydos.distance.JensenShannon + """ + + cmp = JensenShannon() + + def test_jensen_shannon_dist(self): + """Test abydos.distance.JensenShannon.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.332911546 + ) + + def test_jensen_shannon_sim(self): + """Test abydos.distance.JensenShannon.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.667088454 + ) + + def test_jensen_shannon_dist_abs(self): + """Test abydos.distance.JensenShannon.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0.6931471805599453) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0.6931471805599453) + self.assertEqual(self.cmp.dist_abs('abc', ''), 0.6931471805599453) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 0.6931471805599453) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 0.6931471805599453) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 0.3465735903 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 0.3465735903 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.3465735903 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 0.3465735903 + ) + self.assertAlmostEqual( + 
self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.2307566995 + ) + + self.assertEqual( + JensenShannon(intersection_type='soft', qval=2).dist_abs( + 'a', 'eh' + ), + 0.6931471805599453, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_johnson.py b/tests/distance/test_distance_johnson.py new file mode 100644 index 000000000..349c6084b --- /dev/null +++ b/tests/distance/test_distance_johnson.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_johnson. + +This module contains unit tests for abydos.distance.Johnson +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Johnson + + +class JohnsonTestCases(unittest.TestCase): + """Test Johnson functions. + + abydos.distance.Johnson + """ + + cmp = Johnson() + + def test_johnson_sim(self): + """Test abydos.distance.Johnson.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6681818182 + ) + + def test_johnson_dist(self): + """Test abydos.distance.Johnson.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3318181818 + ) + + def test_johnson_sim_score(self): + """Test abydos.distance.Johnson.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 2.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 2.0) + 
self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.3363636364 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kendall_tau.py b/tests/distance/test_distance_kendall_tau.py new file mode 100644 index 000000000..15a9b336f --- /dev/null +++ b/tests/distance/test_distance_kendall_tau.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kendall_tau. + +This module contains unit tests for abydos.distance.KendallTau +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KendallTau + + +class KendallTauTestCases(unittest.TestCase): + """Test KendallTau functions. 
+ + abydos.distance.KendallTau + """ + + cmp = KendallTau() + cmp_no_d = KendallTau(alphabet=0) + + def test_kendall_tau_sim(self): + """Test abydos.distance.KendallTau.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5012771392081737) + self.assertEqual(self.cmp.sim('a', ''), 0.5012706231918055) + self.assertEqual(self.cmp.sim('', 'a'), 0.5012706231918055) + self.assertEqual(self.cmp.sim('abc', ''), 0.5012641071754372) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5012641071754372) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5012771392081737) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5012445591263325) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5012575912) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5012575912) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5012575912) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5012575912) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5012543332 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.16666666666666669) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.16666666666666669) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.8333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.3888888888888889) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4583333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4583333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4583333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4583333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_kendall_tau_dist(self): + """Test abydos.distance.KendallTau.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.49872286079182626) + self.assertEqual(self.cmp.dist('a', ''), 0.4987293768081945) + self.assertEqual(self.cmp.dist('', 'a'), 0.4987293768081945) + self.assertEqual(self.cmp.dist('abc', ''), 0.49873589282456277) + self.assertEqual(self.cmp.dist('', 'abc'), 0.49873589282456277) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.49872286079182626) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.4987554408736675) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4987424088) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4987424088) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4987424088) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4987424088) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4987456668 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.8333333333333333) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.8333333333333333) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.16666666666666674) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.6111111111111112 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5416666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5416666667 + ) + self.assertAlmostEqual( + 
self.cmp_no_d.dist('Colin', 'Coiln'), 0.5416666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5416666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_kendall_tau_corr(self): + """Test abydos.distance.KendallTau.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.002554278416347382) + self.assertEqual(self.cmp.corr('a', ''), 0.0025412463836109156) + self.assertEqual(self.cmp.corr('', 'a'), 0.0025412463836109156) + self.assertEqual(self.cmp.corr('abc', ''), 0.0025282143508744493) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0025282143508744493) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.002554278416347382) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.0024891182526650506) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0025151823) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0025151823) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0025151823) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0025151823) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0025086663 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -2.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -2.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -0.6666666666666666) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -0.6666666666666666) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.6666666666666666) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.2222222222222222 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.0833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.0833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.0833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.0833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kent_foster_i.py b/tests/distance/test_distance_kent_foster_i.py new file mode 100644 index 000000000..4b5f92aef --- /dev/null +++ b/tests/distance/test_distance_kent_foster_i.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kent_foster_i. + +This module contains unit tests for abydos.distance.KentFosterI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KentFosterI + + +class KentFosterITestCases(unittest.TestCase): + """Test KentFosterI functions. 
+ + abydos.distance.KentFosterI + """ + + cmp = KentFosterI() + cmp_no_d = KentFosterI(alphabet=0) + + def test_kent_foster_i_sim(self): + """Test abydos.distance.KentFosterI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.6666666666666667) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8604651163 + ) + + def test_kent_foster_i_dist(self): + """Test abydos.distance.KentFosterI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.33333333333333326) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1395348837 + ) + + def test_kent_foster_i_sim_score(self): + """Test abydos.distance.KentFosterI.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.3333333333333333 + ) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), -0.2) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), -0.2) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), -0.2) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), -0.2) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.1395348837 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kent_foster_ii.py b/tests/distance/test_distance_kent_foster_ii.py new file mode 100644 index 000000000..b59f8c517 --- /dev/null +++ b/tests/distance/test_distance_kent_foster_ii.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kent_foster_ii. + +This module contains unit tests for abydos.distance.KentFosterII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KentFosterII + + +class KentFosterIITestCases(unittest.TestCase): + """Test KentFosterII functions. + + abydos.distance.KentFosterII + """ + + cmp = KentFosterII() + cmp_no_d = KentFosterII(alphabet=0) + + def test_kent_foster_ii_sim(self): + """Test abydos.distance.KentFosterII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9968010236724241) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9980756895) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9980756895) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9980756895) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9980756895) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9977888336 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.6666666666666667) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.6756756757 + ) + + def test_kent_foster_ii_dist(self): + """Test abydos.distance.KentFosterII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.003198976327575931) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0019243105) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0019243105) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0019243105) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0019243105) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0022111664 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 
0.33333333333333326 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.3243243243 + ) + + def test_kent_foster_ii_sim_score(self): + """Test abydos.distance.KentFosterII.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.0031989763275758767 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), -0.0019243105 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), -0.0019243105 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), -0.0019243105 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), -0.0019243105 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.0022111664 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), -0.3333333333333333 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), -0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.3243243243 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_koppen_i.py b/tests/distance/test_distance_koppen_i.py new file mode 100644 index 000000000..2c5cf2b1f --- /dev/null +++ b/tests/distance/test_distance_koppen_i.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_koppen_i. 
+ +This module contains unit tests for abydos.distance.KoppenI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KoppenI +from abydos.tokenizer import QSkipgrams + + +class KoppenITestCases(unittest.TestCase): + """Test KoppenI functions. + + abydos.distance.KoppenI + """ + + cmp = KoppenI() + cmp_no_d = KoppenI(alphabet=0) + + def test_koppen_i_sim(self): + """Test abydos.distance.KoppenI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.49936143039591324) + self.assertEqual(self.cmp.sim('', 'a'), 0.49936143039591324) + self.assertEqual(self.cmp.sim('abc', ''), 0.4987212276214834) + self.assertEqual(self.cmp.sim('', 'abc'), 0.4987212276214834) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49679075738125805) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7471079692) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7471079692) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7471079692) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7471079692) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8295625943 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_koppen_i_dist(self): + """Test abydos.distance.KoppenI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5006385696040867) + self.assertEqual(self.cmp.dist('', 'a'), 0.5006385696040867) + self.assertEqual(self.cmp.dist('abc', ''), 0.5012787723785166) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5012787723785166) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2528920308) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2528920308) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2528920308) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2528920308) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1704374057 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + 
self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_koppen_i_corr(self): + """Test abydos.distance.KoppenI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -0.0012771392081735637) + self.assertEqual(self.cmp.corr('', 'a'), -0.0012771392081735637) + self.assertEqual(self.cmp.corr('abc', ''), -0.002557544757033164) + self.assertEqual(self.cmp.corr('', 'abc'), -0.002557544757033164) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483896) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4942159383) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4942159383) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4942159383) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4942159383) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6591251885 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + self.assertEqual( + KoppenI( + alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK') + ).corr('eh', 'a'), + 0.0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_koppen_ii.py b/tests/distance/test_distance_koppen_ii.py new file mode 100644 index 000000000..85df88bb4 --- /dev/null +++ b/tests/distance/test_distance_koppen_ii.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_koppen_ii. + +This module contains unit tests for abydos.distance.KoppenII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KoppenII + + +class KoppenIITestCases(unittest.TestCase): + """Test KoppenII functions. 
+ + abydos.distance.KoppenII + """ + + cmp = KoppenII() + cmp_no_d = KoppenII(alphabet=0) + + def test_koppen_ii_sim(self): + """Test abydos.distance.KoppenII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.75) + + def test_koppen_ii_dist(self): + """Test abydos.distance.KoppenII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.25) + + def test_koppen_ii_sim_score(self): + """Test abydos.distance.KoppenII.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 1.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 1.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 2.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 2.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 4.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 5.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 6.0) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 6.0) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 6.0) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 6.0) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 10.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuder_richardson.py b/tests/distance/test_distance_kuder_richardson.py new file mode 100644 index 000000000..808383f8a --- /dev/null +++ b/tests/distance/test_distance_kuder_richardson.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
+ +"""abydos.tests.distance.test_distance_kuder_richardson. + +This module contains unit tests for abydos.distance.KuderRichardson +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuderRichardson + + +class KuderRichardsonTestCases(unittest.TestCase): + """Test KuderRichardson functions. + + abydos.distance.KuderRichardson + """ + + cmp = KuderRichardson() + cmp_no_d = KuderRichardson(alphabet=0) + + def test_kuder_richardson_sim(self): + """Test abydos.distance.KuderRichardson.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4935400516795866) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8316151203) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8316151203) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8316151203) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8316151203) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8983851254 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0102040816 + ) + + def test_kuder_richardson_dist(self): + """Test abydos.distance.KuderRichardson.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5064599483204134) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.1683848797) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.1683848797) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1683848797) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1683848797) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1016148746 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + 
self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.9897959184 + ) + + def test_kuder_richardson_corr(self): + """Test abydos.distance.KuderRichardson.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.012919896640826873) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.6632302405) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.6632302405) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.6632302405) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.6632302405) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.7967702508 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), float('-inf')) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -2.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -2.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -2.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -2.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.9795918367 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_i.py b/tests/distance/test_distance_kuhns_i.py new file mode 100644 index 000000000..90189dbe0 --- /dev/null +++ b/tests/distance/test_distance_kuhns_i.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_i. + +This module contains unit tests for abydos.distance.KuhnsI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsI + + +class KuhnsITestCases(unittest.TestCase): + """Test KuhnsI functions. 
+ + abydos.distance.KuhnsI + """ + + cmp = KuhnsI() + cmp_no_d = KuhnsI(alphabet=0) + + def test_kuhns_i_sim(self): + """Test abydos.distance.KuhnsI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5101520199916701) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49991865368596416) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5075359225) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5075359225) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5174992191 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3775510204 + ) + + def test_kuhns_i_dist(self): + """Test abydos.distance.KuhnsI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.48984798000832985) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5000813463140359) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4924640775) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4924640775) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4825007809 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6224489796 + ) + + def test_kuhns_i_corr(self): + """Test abydos.distance.KuhnsI.corr.""" + # Base cases 
+ self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.010152019991670138) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -8.134631403581842e-05) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0075359225) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0075359225) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0075359225) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0075359225) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0174992191 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.1224489796 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_ii.py b/tests/distance/test_distance_kuhns_ii.py new file mode 100644 index 000000000..f72af1f10 --- /dev/null +++ b/tests/distance/test_distance_kuhns_ii.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_ii. + +This module contains unit tests for abydos.distance.KuhnsII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsII + + +class KuhnsIITestCases(unittest.TestCase): + """Test KuhnsII functions. 
+ + abydos.distance.KuhnsII + """ + + cmp = KuhnsII() + cmp_no_d = KuhnsII(alphabet=0) + + def test_kuhns_ii_sim(self): + """Test abydos.distance.KuhnsII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9965986394557823) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.32908163265306123) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6615646259) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7490723562 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2813852814 + ) + + def test_kuhns_ii_dist(self): + """Test abydos.distance.KuhnsII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.003401360544217691) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6709183673469388) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3384353741) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2509276438 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7777777778 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7186147186 + ) + + def test_kuhns_ii_corr(self): + """Test abydos.distance.KuhnsII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006377551020408163) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4923469388) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6236085343 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0779220779 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_iii.py b/tests/distance/test_distance_kuhns_iii.py new file mode 100644 index 000000000..b154de235 --- /dev/null +++ b/tests/distance/test_distance_kuhns_iii.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_iii. + +This module contains unit tests for abydos.distance.KuhnsIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsIII + + +class KuhnsIIITestCases(unittest.TestCase): + """Test KuhnsIII functions. 
+ + abydos.distance.KuhnsIII + """ + + cmp = KuhnsIII() + cmp_no_d = KuhnsIII(alphabet=0) + + def test_kuhns_iii_sim(self): + """Test abydos.distance.KuhnsIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.25) + self.assertEqual(self.cmp.sim('a', ''), 0.25) + self.assertEqual(self.cmp.sim('', 'a'), 0.25) + self.assertEqual(self.cmp.sim('abc', ''), 0.25) + self.assertEqual(self.cmp.sim('', 'abc'), 0.25) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9980818414322251) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.24760076775431863) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4971190781) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4971190781) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4971190781) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4971190781) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6199553626 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.25) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.25) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.25) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.25) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.25) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.25) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.125) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1766304348 + ) + + def test_kuhns_iii_dist(self): + """Test abydos.distance.KuhnsIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.75) + self.assertEqual(self.cmp.dist('a', ''), 0.75) + self.assertEqual(self.cmp.dist('', 'a'), 0.75) + self.assertEqual(self.cmp.dist('abc', ''), 0.75) + self.assertEqual(self.cmp.dist('', 'abc'), 0.75) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0019181585677748858) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.7523992322456814) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5028809219) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5028809219) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5028809219) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5028809219) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3800446374 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.75) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.75) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.75) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.75) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.75) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.75) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.875) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8233695652 + ) + + def test_kuhns_iii_corr(self): + """Test abydos.distance.KuhnsIII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + 
self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9974424552429668) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.003198976327575176) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.3294921041) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.3294921041) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.3294921041) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.3294921041) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.4932738168 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.3333333333333333 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.097826087 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_iv.py b/tests/distance/test_distance_kuhns_iv.py new file mode 100644 index 000000000..c417c8a61 --- /dev/null +++ b/tests/distance/test_distance_kuhns_iv.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_iv. + +This module contains unit tests for abydos.distance.KuhnsIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsIV + + +class KuhnsIVTestCases(unittest.TestCase): + """Test KuhnsIV functions. 
+ + abydos.distance.KuhnsIV + """ + + cmp = KuhnsIV() + cmp_no_d = KuhnsIV(alphabet=0) + + def test_kuhns_iv_sim(self): + """Test abydos.distance.KuhnsIV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4968112244897959) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7461734694) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7461734694) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7461734694) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7461734694) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8429846939 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.25) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4166666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4166666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4166666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4166666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4571428571 + ) + + def test_kuhns_iv_dist(self): + """Test abydos.distance.KuhnsIV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503188775510204) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2538265306) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2538265306) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2538265306) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2538265306) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1570153061 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.75) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5833333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5428571429 + ) + + def test_kuhns_iv_corr(self): + """Test abydos.distance.KuhnsIV.corr.""" + # 
Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006377551020408163) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4923469388) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6859693878 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0857142857 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_ix.py b/tests/distance/test_distance_kuhns_ix.py new file mode 100644 index 000000000..1d5f53794 --- /dev/null +++ b/tests/distance/test_distance_kuhns_ix.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_ix. + +This module contains unit tests for abydos.distance.KuhnsIX +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsIX + + +class KuhnsIXTestCases(unittest.TestCase): + """Test KuhnsIX functions. 
+ + abydos.distance.KuhnsIX + """ + + cmp = KuhnsIX() + cmp_no_d = KuhnsIX(alphabet=0) + + def test_kuhns_ix_sim(self): + """Test abydos.distance.KuhnsIX.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8314623708 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3348554352 + ) + + def test_kuhns_ix_dist(self): + """Test abydos.distance.KuhnsIX.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242619) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1685376292 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6651445648 + ) + + def test_kuhns_ix_corr(self): + """Test abydos.distance.KuhnsIX.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 
'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp.corr('abcd', 'efgh'), -0.006418485237483954 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6629247416 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3302891295 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_v.py b/tests/distance/test_distance_kuhns_v.py new file mode 100644 index 000000000..9eda339e6 --- /dev/null +++ b/tests/distance/test_distance_kuhns_v.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_v. + +This module contains unit tests for abydos.distance.KuhnsV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsV + + +class KuhnsVTestCases(unittest.TestCase): + """Test KuhnsV functions. 
+ + abydos.distance.KuhnsV + """ + + cmp = KuhnsV() + cmp_no_d = KuhnsV(alphabet=0) + + def test_kuhns_v_sim(self): + """Test abydos.distance.KuhnsV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8162413266 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.35 + ) + + def test_kuhns_v_dist(self): + """Test abydos.distance.KuhnsV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.503209242618742 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1837586734 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.65 + ) + + def test_kuhns_v_corr(self): + """Test abydos.distance.KuhnsV.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 
'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp.corr('abcd', 'efgh'), -0.006418485237484 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6324826532 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_vi.py b/tests/distance/test_distance_kuhns_vi.py new file mode 100644 index 000000000..f69eac130 --- /dev/null +++ b/tests/distance/test_distance_kuhns_vi.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_vi. + +This module contains unit tests for abydos.distance.KuhnsVI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsVI + + +class KuhnsVITestCases(unittest.TestCase): + """Test KuhnsVI functions. 
+ + abydos.distance.KuhnsVI + """ + + cmp = KuhnsVI() + cmp_no_d = KuhnsVI(alphabet=0) + + def test_kuhns_vi_sim(self): + """Test abydos.distance.KuhnsVI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8474160207 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3181818182 + ) + + def test_kuhns_vi_dist(self): + """Test abydos.distance.KuhnsVI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.503209242618742 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1525839793 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6818181818 + ) + + def test_kuhns_vi_corr(self): + """Test abydos.distance.KuhnsVI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + 
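# For this measure, sim() appears to be a plain rescaling of corr() from [-1, 1]
+ # into [0, 1], i.e. sim = (corr + 1) / 2: the name-pair corr of 0.4961439589 below
+ # matches the sim value of 0.7480719794 tested above, and the corr 0.0 base cases
+ # here pair with the sim base cases of 0.5. +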
self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp.corr('abcd', 'efgh'), -0.006418485237484 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6948320413 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_vii.py b/tests/distance/test_distance_kuhns_vii.py new file mode 100644 index 000000000..91fe68215 --- /dev/null +++ b/tests/distance/test_distance_kuhns_vii.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_vii. + +This module contains unit tests for abydos.distance.KuhnsVII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsVII + + +class KuhnsVIITestCases(unittest.TestCase): + """Test KuhnsVII functions. 
+ + abydos.distance.KuhnsVII + """ + + cmp = KuhnsVII() + cmp_no_d = KuhnsVII(alphabet=0) + + def test_kuhns_vii_sim(self): + """Test abydos.distance.KuhnsVII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9965986394557823) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.32908163265306123) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6615646259) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7693640991 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2788497568 + ) + + def test_kuhns_vii_dist(self): + """Test abydos.distance.KuhnsVII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.003401360544217691) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6709183673469388) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3384353741) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2306359009 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7777777778 + ) + 
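# In the alphabet=0 cases the both-absent token count d (normally derived from a
+ # full n-gram alphabet) presumably drops out of the underlying 2x2 contingency
+ # table, which is why these expectations differ sharply from the default-alphabet
+ # values above. +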
self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7211502432 + ) + + def test_kuhns_vii_corr(self): + """Test abydos.distance.KuhnsVII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006377551020408163) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4923469388) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6540461486 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0817253648 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_viii.py b/tests/distance/test_distance_kuhns_viii.py new file mode 100644 index 000000000..88d8812a0 --- /dev/null +++ b/tests/distance/test_distance_kuhns_viii.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_viii. + +This module contains unit tests for abydos.distance.KuhnsVIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsVIII + + +class KuhnsVIIITestCases(unittest.TestCase): + """Test KuhnsVIII functions. 
+ + abydos.distance.KuhnsVIII + """ + + cmp = KuhnsVIII() + cmp_no_d = KuhnsVIII(alphabet=0) + + def test_kuhns_viii_sim(self): + """Test abydos.distance.KuhnsVIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9965986394557823) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.32908163265306123) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6615646259) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6615646259) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7688694525 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.3333333333333333) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2789115646 + ) + + def test_kuhns_viii_dist(self): + """Test abydos.distance.KuhnsVIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.003401360544217691) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6709183673469388) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3384353741) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3384353741) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2311305475 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.6666666666666667) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.7777777778 
+ ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7210884354 + ) + + def test_kuhns_viii_corr(self): + """Test abydos.distance.KuhnsVIII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006377551020408163) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4923469388) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4923469388) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6533041788 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.5) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0816326531 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_x.py b/tests/distance/test_distance_kuhns_x.py new file mode 100644 index 000000000..87ba4050e --- /dev/null +++ b/tests/distance/test_distance_kuhns_x.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_x. + +This module contains unit tests for abydos.distance.KuhnsX +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsX + + +class KuhnsXTestCases(unittest.TestCase): + """Test KuhnsX functions. 
+ + abydos.distance.KuhnsX + """ + + cmp = KuhnsX() + cmp_no_d = KuhnsX(alphabet=0) + + def test_kuhns_x_sim(self): + """Test abydos.distance.KuhnsX.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9977786005 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.125) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.125) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.18421052631 + ) + + def test_kuhns_x_dist(self): + """Test abydos.distance.KuhnsX.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0022213995 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.875) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.875) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.81578947368 + ) + + def test_kuhns_x_corr(self): + """Test abydos.distance.KuhnsX.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
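# The empty-string cases are degenerate (no tokens on one side of the table), and
+ # the implementation evidently falls back to a correlation of 0.0 for them rather
+ # than attempting the usual ratio. +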
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9922879177) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.995557201 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.75) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.75) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.75) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.75) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.6315789474 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_xi.py b/tests/distance/test_distance_kuhns_xi.py new file mode 100644 index 000000000..ec261cd1f --- /dev/null +++ b/tests/distance/test_distance_kuhns_xi.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_xi. + +This module contains unit tests for abydos.distance.KuhnsXI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsXI + + +class KuhnsXITestCases(unittest.TestCase): + """Test KuhnsXI functions. 
+ + abydos.distance.KuhnsXI + """ + + cmp = KuhnsXI() + cmp_no_d = KuhnsXI(alphabet=0) + + def test_kuhns_xi_sim(self): + """Test abydos.distance.KuhnsXI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9414271324) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9549418688 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_kuhns_xi_dist(self): + """Test abydos.distance.KuhnsXI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0585728676) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0450581312 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_kuhns_xi_corr(self): + """Test abydos.distance.KuhnsXI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 
0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.8828542648) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9098837375 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kuhns_xii.py b/tests/distance/test_distance_kuhns_xii.py new file mode 100644 index 000000000..787981b87 --- /dev/null +++ b/tests/distance/test_distance_kuhns_xii.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kuhns_xii. + +This module contains unit tests for abydos.distance.KuhnsXII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KuhnsXII + + +class KuhnsXIITestCases(unittest.TestCase): + """Test KuhnsXII functions. 
+ + abydos.distance.KuhnsXII + """ + + cmp = KuhnsXII() + cmp_no_d = KuhnsXII(alphabet=0) + + def test_kuhns_xii_sim(self): + """Test abydos.distance.KuhnsXII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2490322581) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2490322581) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2490322581) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2490322581) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4444628099 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.375) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + def test_kuhns_xii_dist(self): + """Test abydos.distance.KuhnsXII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7509677419) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7509677419) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7509677419) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7509677419) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5555371901 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.625) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + def test_kuhns_xii_sim_score(self): + """Test abydos.distance.KuhnsXII.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + 
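# sim_score() is the raw, unnormalized Kuhns XII value: it is unbounded above
+ # (195.0 for identical strings below) and can go negative (-1.0 for disjoint
+ # strings), whereas sim() above maps the measure into [0, 1]. +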
self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 195.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 64.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 64.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 64.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 64.3333333333 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 48.8909090909 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), -0.25 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), -0.25 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), -0.25 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), -0.25 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.1090909091 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kulczynski_i.py b/tests/distance/test_distance_kulczynski_i.py new file mode 100644 index 000000000..cd98a8602 --- /dev/null +++ b/tests/distance/test_distance_kulczynski_i.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kulczynski_i. + +This module contains unit tests for abydos.distance.KulczynskiI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KulczynskiI + + +class KulczynskiITestCases(unittest.TestCase): + """Test KulczynskiI functions. 
+ + abydos.distance.KulczynskiI + """ + + cmp = KulczynskiI() + + def test_kulczynski_i_sim_score(self): + """Test abydos.distance.KulczynskiI.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), float('inf')) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_kulczynski_i_dist(self): + """Test abydos.distance.KulczynskiI.dist.""" + self.assertRaises(NotImplementedError, self.cmp.dist) + + def test_kulczynski_i_sim(self): + """Test abydos.distance.KulczynskiI.sim.""" + self.assertRaises(NotImplementedError, self.cmp.sim) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_kulczynski_ii.py b/tests/distance/test_distance_kulczynski_ii.py new file mode 100644 index 000000000..d676b1897 --- /dev/null +++ b/tests/distance/test_distance_kulczynski_ii.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_kulczynski_ii. + +This module contains unit tests for abydos.distance.KulczynskiII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import KulczynskiII + + +class KulczynskiIITestCases(unittest.TestCase): + """Test KulczynskiII functions. 
+ + abydos.distance.KulczynskiII + """ + + cmp = KulczynskiII() + + def test_kulczynski_ii_sim(self): + """Test abydos.distance.KulczynskiII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6681818182 + ) + + def test_kulczynski_ii_dist(self): + """Test abydos.distance.KulczynskiII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3318181818 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_lcprefix.py b/tests/distance/test_distance_lcprefix.py new file mode 100644 index 000000000..e91b42476 --- /dev/null +++ b/tests/distance/test_distance_lcprefix.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_lcprefix. + +This module contains unit tests for abydos.distance.LCPrefix +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import LCPrefix + + +class LCPrefixTestCases(unittest.TestCase): + """Test LCPrefix functions. 
+ + abydos.distance.LCPrefix + """ + + cmp = LCPrefix() + + def test_lcprefix_sim(self): + """Test abydos.distance.LCPrefix.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1) + + def test_lcprefix_dist(self): + """Test abydos.distance.LCPrefix.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.9) + + def test_lcprefix_dist_abs(self): + """Test abydos.distance.LCPrefix.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 1) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall', 'Niel'), 2) + with self.assertRaises(TypeError): + self.cmp.dist_abs('Nigel', 'Niall', 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_lcsuffix.py b/tests/distance/test_distance_lcsuffix.py new file mode 100644 index 000000000..6c1fadfad --- /dev/null +++ b/tests/distance/test_distance_lcsuffix.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
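+#
+# Like LCPrefix above, this measure is built on the longest common suffix:
+# dist_abs() returns its length ('Nigel'/'Niall' share only the final 'l', hence 1)
+# and sim() evidently divides that length by the longer input's length (1/5 = 0.2).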
+ +"""abydos.tests.distance.test_distance_lcsuffix. + +This module contains unit tests for abydos.distance.LCSuffix +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import LCSuffix + + +class LCSuffixTestCases(unittest.TestCase): + """Test LCSuffix functions. + + abydos.distance.LCSuffix + """ + + cmp = LCSuffix() + + def test_lcsuffix_sim(self): + """Test abydos.distance.LCSuffix.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0) + + def test_lcsuffix_dist(self): + """Test abydos.distance.LCSuffix.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 1.0) + + def test_lcsuffix_dist_abs(self): + """Test abydos.distance.LCSuffix.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall', 'Niel'), 1) + with self.assertRaises(TypeError): + self.cmp.dist_abs('Nigel', 'Niall', 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_length.py b/tests/distance/test_distance_length.py index c76b18d25..f0e16249a 100644 --- a/tests/distance/test_distance_length.py +++ b/tests/distance/test_distance_length.py @@ -36,12 +36,12 @@ class LengthTestCases(unittest.TestCase): """Test length similarity functions. 
- abydos.distance._basic.Length + abydos.distance.Length """ cmp = Length() - def test_ident_sim(self): + def test_length_sim(self): """Test abydos.distance.Length.sim.""" self.assertEqual(self.cmp.sim('', ''), 1) self.assertEqual(self.cmp.sim('', 'a'), 0) @@ -58,7 +58,7 @@ def test_ident_sim(self): # Test wrapper self.assertEqual(sim_length('abcd', 'cba'), 0.75) - def test_ident_dist(self): + def test_length_dist(self): """Test abydos.distance.Length.dist.""" self.assertEqual(self.cmp.dist('', ''), 0) self.assertEqual(self.cmp.dist('', 'a'), 1) diff --git a/tests/distance/test_distance_levenshtein.py b/tests/distance/test_distance_levenshtein.py index b89639171..2a49d7208 100644 --- a/tests/distance/test_distance_levenshtein.py +++ b/tests/distance/test_distance_levenshtein.py @@ -45,6 +45,7 @@ class LevenshteinTestCases(unittest.TestCase): """ cmp = Levenshtein() + cmp_taper = Levenshtein(taper=True) def test_levenshtein_dist_abs(self): """Test abydos.distance.Levenshtein.dist_abs.""" @@ -84,68 +85,100 @@ def test_levenshtein_dist_abs(self): ) # https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance - self.assertEqual(self.cmp.dist_abs('CA', 'ABC', 'osa'), 3) + self.assertEqual(Levenshtein(mode='osa').dist_abs('CA', 'ABC'), 3) # test cost of insert self.assertEqual( - self.cmp.dist_abs('', 'b', 'lev', cost=(5, 7, 10, 10)), 5 + Levenshtein(mode='lev', cost=(5, 7, 10, 10)).dist_abs('', 'b'), 5 ) self.assertEqual( - self.cmp.dist_abs('', 'b', 'osa', cost=(5, 7, 10, 10)), 5 + Levenshtein(mode='osa', cost=(5, 7, 10, 10)).dist_abs('', 'b'), 5 ) self.assertEqual( - self.cmp.dist_abs('a', 'ab', 'lev', cost=(5, 7, 10, 10)), 5 + Levenshtein(mode='lev', cost=(5, 7, 10, 10)).dist_abs('a', 'ab'), 5 ) self.assertEqual( - self.cmp.dist_abs('a', 'ab', 'osa', cost=(5, 7, 10, 10)), 5 + Levenshtein(mode='osa', cost=(5, 7, 10, 10)).dist_abs('a', 'ab'), 5 ) # test cost of delete self.assertEqual( - self.cmp.dist_abs('b', '', 'lev', cost=(5, 7, 10, 10)), 7 + Levenshtein(mode='lev', cost=(5, 7, 10, 10)).dist_abs('b', ''), 7 ) self.assertEqual( - self.cmp.dist_abs('b', '', 'osa', cost=(5, 7, 10, 10)), 7 + Levenshtein(mode='osa', cost=(5, 7, 10, 10)).dist_abs('b', ''), 7 ) self.assertEqual( - self.cmp.dist_abs('ab', 'a', 'lev', cost=(5, 7, 10, 10)), 7 + Levenshtein(mode='lev', cost=(5, 7, 10, 10)).dist_abs('ab', 'a'), 7 ) self.assertEqual( - self.cmp.dist_abs('ab', 'a', 'osa', cost=(5, 7, 10, 10)), 7 + Levenshtein(mode='osa', cost=(5, 7, 10, 10)).dist_abs('ab', 'a'), 7 ) # test cost of substitute self.assertEqual( - self.cmp.dist_abs('a', 'b', 'lev', cost=(10, 10, 5, 10)), 5 + Levenshtein(mode='lev', cost=(10, 10, 5, 10)).dist_abs('a', 'b'), 5 ) self.assertEqual( - self.cmp.dist_abs('a', 'b', 'osa', cost=(10, 10, 5, 10)), 5 + Levenshtein(mode='osa', cost=(10, 10, 5, 10)).dist_abs('a', 'b'), 5 ) self.assertEqual( - self.cmp.dist_abs('ac', 'bc', 'lev', cost=(10, 10, 5, 10)), 5 + Levenshtein(mode='lev', cost=(10, 10, 5, 10)).dist_abs('ac', 'bc'), + 5, ) self.assertEqual( - self.cmp.dist_abs('ac', 'bc', 'osa', cost=(10, 10, 5, 10)), 5 + Levenshtein(mode='osa', cost=(10, 10, 5, 10)).dist_abs('ac', 'bc'), + 5, ) # test cost of transpose self.assertEqual( - self.cmp.dist_abs('ab', 'ba', 'lev', cost=(10, 10, 10, 5)), 20 + Levenshtein(mode='lev', cost=(10, 10, 10, 5)).dist_abs('ab', 'ba'), + 20, ) self.assertEqual( - self.cmp.dist_abs('ab', 'ba', 'osa', cost=(10, 10, 10, 5)), 5 + Levenshtein(mode='osa', cost=(10, 10, 10, 5)).dist_abs('ab', 'ba'), + 5, ) self.assertEqual( - self.cmp.dist_abs('abc', 'bac', 
'lev', cost=(10, 10, 10, 5)), 20 + Levenshtein(mode='lev', cost=(10, 10, 10, 5)).dist_abs( + 'abc', 'bac' + ), + 20, ) self.assertEqual( - self.cmp.dist_abs('abc', 'bac', 'osa', cost=(10, 10, 10, 5)), 5 + Levenshtein(mode='osa', cost=(10, 10, 10, 5)).dist_abs( + 'abc', 'bac' + ), + 5, ) self.assertEqual( - self.cmp.dist_abs('cab', 'cba', 'lev', cost=(10, 10, 10, 5)), 20 + Levenshtein(mode='lev', cost=(10, 10, 10, 5)).dist_abs( + 'cab', 'cba' + ), + 20, ) self.assertEqual( - self.cmp.dist_abs('cab', 'cba', 'osa', cost=(10, 10, 10, 5)), 5 + Levenshtein(mode='osa', cost=(10, 10, 10, 5)).dist_abs( + 'cab', 'cba' + ), + 5, + ) + + # tapered variant + self.assertAlmostEqual( + self.cmp_taper.dist_abs('abc', 'ac'), 1.33333333333 + ) + self.assertAlmostEqual( + self.cmp_taper.dist_abs('xabxcdxxefxgx', 'abcdefg'), + 8.615384615384617, + ) + self.assertAlmostEqual( + self.cmp_taper.dist_abs('levenshtein', 'frankenstein'), 10 + ) + self.assertAlmostEqual( + self.cmp_taper.dist_abs('distance', 'difference'), + 7.499999999999999, ) # Test wrapper @@ -170,6 +203,17 @@ def test_levenshtein_dist(self): self.assertAlmostEqual(self.cmp.dist('abbc', 'ac'), 1 / 2) self.assertAlmostEqual(self.cmp.dist('abbc', 'abc'), 1 / 4) + # tapered variant + self.assertAlmostEqual( + self.cmp_taper.dist('abc', 'ac'), 0.2666666666666666 + ) + self.assertAlmostEqual( + self.cmp_taper.dist('abbc', 'ac'), 0.4230769230769231 + ) + self.assertAlmostEqual( + self.cmp_taper.dist('abbc', 'abc'), 0.19230769230769232 + ) + # Test wrapper self.assertAlmostEqual(dist_levenshtein('abbc', 'abc'), 1 / 4) diff --git a/tests/distance/test_distance_lorentzian.py b/tests/distance/test_distance_lorentzian.py new file mode 100644 index 000000000..e7ad27674 --- /dev/null +++ b/tests/distance/test_distance_lorentzian.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_lorentzian. + +This module contains unit tests for abydos.distance.Lorentzian +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Lorentzian + + +class LorentzianTestCases(unittest.TestCase): + """Test Lorentzian functions. 
+ + abydos.distance.Lorentzian + """ + + cmp = Lorentzian() + + def test_lorentzian_dist(self): + """Test abydos.distance.Lorentzian.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_lorentzian_sim(self): + """Test abydos.distance.Lorentzian.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_lorentzian_dist_abs(self): + """Test abydos.distance.Lorentzian.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.3862943611198906) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.3862943611198906) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2.772588722239781) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2.772588722239781) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 6.931471805599453) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 4.1588830834 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 4.1588830834 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 4.1588830834 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 4.1588830834 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4.8520302639 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_maarel.py b/tests/distance/test_distance_maarel.py new file mode 100644 index 000000000..8e09741b1 --- /dev/null +++ b/tests/distance/test_distance_maarel.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_maarel. + +This module contains unit tests for abydos.distance.Maarel +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Maarel + + +class MaarelTestCases(unittest.TestCase): + """Test Maarel functions. + + abydos.distance.Maarel + """ + + cmp = Maarel() + cmp_no_d = Maarel(alphabet=0) + + def test_maarel_sim(self): + """Test abydos.distance.Maarel.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_maarel_dist(self): + """Test abydos.distance.Maarel.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_maarel_corr(self): + """Test abydos.distance.Maarel.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_manhattan.py b/tests/distance/test_distance_manhattan.py index 6b4728693..f74a9db15 100644 --- a/tests/distance/test_distance_manhattan.py +++ b/tests/distance/test_distance_manhattan.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Manhattan, dist_manhattan, manhattan, sim_manhattan -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class ManhattanTestCases(unittest.TestCase): """ cmp = Manhattan() + cmp_q2 = Manhattan(tokenizer=QGrams(2)) + cmp_ws = Manhattan(tokenizer=WhitespaceTokenizer()) def test_manhattan_dist_abs(self): """Test abydos.distance.Manhattan.dist_abs.""" @@ -51,25 +53,47 @@ def test_manhattan_dist_abs(self): self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 8) self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7) - self.assertEqual(self.cmp.dist_abs('', '', 2), 0) - self.assertEqual(self.cmp.dist_abs('nelson', '', 2), 7) - self.assertEqual(self.cmp.dist_abs('', 'neilsen', 2), 8) - self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen', 2), 7) + self.assertEqual(self.cmp_q2.dist_abs('', ''), 0) + self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7) + self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8) + self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 7) # supplied q-gram tests - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist_abs(QGrams('nelson'), QGrams('')), 7) - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('neilsen')), 8) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 7, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8, + ) self.assertAlmostEqual( - self.cmp.dist_abs(QGrams('nelson'), QGrams('neilsen')), 7 + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7, ) # non-q-gram tests - self.assertEqual(self.cmp.dist_abs('', '', 0), 0) - self.assertEqual(self.cmp.dist_abs('the quick', '', 0), 2) - self.assertEqual(self.cmp.dist_abs('', 'the quick', 0), 2) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_FROM, NONQ_TO, 0), 8) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_TO, NONQ_FROM, 0), 8) + self.assertEqual(self.cmp_ws.dist_abs('', ''), 0) + self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2) + self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2) + self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8) + self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8) # Test wrapper self.assertAlmostEqual(manhattan('nelson', 'neilsen'), 7) @@ -81,25 +105,47 @@ def test_manhattan_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 8 / 15) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + 
QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 8 / 15 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper self.assertAlmostEqual(sim_manhattan('nelson', 'neilsen'), 8 / 15) @@ -111,25 +157,47 @@ def test_manhattan_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 2), 7 / 15) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 7 / 15 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper self.assertAlmostEqual(dist_manhattan('nelson', 'neilsen'), 7 / 15) diff --git a/tests/distance/test_distance_marking.py b/tests/distance/test_distance_marking.py new file mode 100644 index 000000000..fe5c24e26 --- /dev/null +++ b/tests/distance/test_distance_marking.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# 
Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_marking. + +This module contains unit tests for abydos.distance.Marking +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Marking + + +class MarkingTestCases(unittest.TestCase): + """Test Marking functions. + + abydos.distance.Marking + """ + + cmp = Marking() + + def test_marking_dist(self): + """Test abydos.distance.Marking.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2) + + def test_marking_sim(self): + """Test abydos.distance.Marking.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8) + + def test_marking_dist_abs(self): + """Test abydos.distance.Marking.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2) + + # Examples from paper + self.assertEqual(self.cmp.dist_abs('cbaabdcb', 'abcba'), 2) + self.assertEqual(self.cmp.dist_abs('abba', 'a'), 2) + 
self.assertEqual(self.cmp.dist_abs('baab', 'a'), 3) + # The following are from the example on p. 196 of the paper, but are + # there given in reverse order. + self.assertEqual(self.cmp.dist_abs('ab', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abcabcabcab', 'ab'), 3) + self.assertEqual(self.cmp.dist_abs('abcabcabcab', 'a'), 7) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_marking_metric.py b/tests/distance/test_distance_marking_metric.py new file mode 100644 index 000000000..f19cfe8ee --- /dev/null +++ b/tests/distance/test_distance_marking_metric.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_marking_metric. + +This module contains unit tests for abydos.distance.MarkingMetric +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from math import log + +from abydos.distance import MarkingMetric + + +class MarkingMetricTestCases(unittest.TestCase): + """Test MarkingMetric functions. + + abydos.distance.MarkingMetric + """ + + cmp = MarkingMetric() + + def test_marking_metric_dist(self): + """Test abydos.distance.MarkingMetric.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6131471928) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6131471928) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6131471928) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6131471928) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4674468153 + ) + + def test_marking_metric_sim(self): + """Test abydos.distance.MarkingMetric.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3868528072) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3868528072) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3868528072) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3868528072) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5325531847 + ) + + def test_marking_metric_dist_abs(self): + """Test abydos.distance.MarkingMetric.dist_abs.""" + # 
Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4.643856189774724) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 3.1699250014 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 3.1699250014 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 3.1699250014 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 3.1699250014 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 3.1699250014 + ) + + # Examples from paper + self.assertEqual(self.cmp.dist_abs('abba', 'a'), log(3, 2)) + self.assertEqual(self.cmp.dist_abs('baab', 'a'), 2.0) + # The following are from the example on p. 196 of the paper, but are + # there given in reverse order. + self.assertEqual(self.cmp.dist_abs('ab', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abcabcabcab', 'ab'), 2) + self.assertEqual(self.cmp.dist_abs('abcabcabcab', 'a'), 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_masi.py b/tests/distance/test_distance_masi.py new file mode 100644 index 000000000..9875e1e6f --- /dev/null +++ b/tests/distance/test_distance_masi.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_masi. + +This module contains unit tests for abydos.distance.MASI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import MASI + + +class MASITestCases(unittest.TestCase): + """Test MASI functions. 
+ + abydos.distance.MASI + """ + + cmp = MASI() + + def test_masi_sim(self): + """Test abydos.distance.MASI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1111111111) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1111111111) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1111111111) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1111111111) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1666666667 + ) + + def test_masi_dist(self): + """Test abydos.distance.MASI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8888888889) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8888888889) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8888888889) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8888888889) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_matusita.py b/tests/distance/test_distance_matusita.py new file mode 100644 index 000000000..4b6c13d53 --- /dev/null +++ b/tests/distance/test_distance_matusita.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_matusita. + +This module contains unit tests for abydos.distance.Matusita +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Matusita + + +class MatusitaTestCases(unittest.TestCase): + """Test Matusita functions. 
+ + abydos.distance.Matusita + """ + + cmp = Matusita() + + def test_matusita_dist(self): + """Test abydos.distance.Matusita.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.707106781186547) + self.assertEqual(self.cmp.dist('', 'a'), 0.707106781186547) + self.assertEqual(self.cmp.dist('abc', ''), 0.707106781186547) + self.assertEqual(self.cmp.dist('', 'abc'), 0.707106781186547) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7071067812) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7071067812) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5766941889 + ) + + def test_matusita_sim(self): + """Test abydos.distance.Matusita.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.292893218813453) + self.assertEqual(self.cmp.sim('', 'a'), 0.292893218813453) + self.assertEqual(self.cmp.sim('abc', ''), 0.292893218813453) + self.assertEqual(self.cmp.sim('', 'abc'), 0.292893218813453) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2928932188) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2928932188) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4233058111 + ) + + def test_matusita_dist_abs(self): + """Test abydos.distance.Matusita.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 1.414213562373095) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.8155687433 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_maxwell_pilliner.py b/tests/distance/test_distance_maxwell_pilliner.py new file mode 100644 index 000000000..adb01a4a9 --- /dev/null +++ b/tests/distance/test_distance_maxwell_pilliner.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_maxwell_pilliner. + +This module contains unit tests for abydos.distance.MaxwellPilliner +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import MaxwellPilliner + + +class MaxwellPillinerTestCases(unittest.TestCase): + """Test MaxwellPilliner functions. + + abydos.distance.MaxwellPilliner + """ + + cmp = MaxwellPilliner() + cmp_no_d = MaxwellPilliner(alphabet=0) + + def test_maxwell_pilliner_sim(self): + """Test abydos.distance.MaxwellPilliner.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310964723 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3356164384 + ) + + def test_maxwell_pilliner_dist(self): + """Test abydos.distance.MaxwellPilliner.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1689035277 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + 
self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6643835616 + ) + + def test_maxwell_pilliner_corr(self): + """Test abydos.distance.MaxwellPilliner.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6621929447 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3287671233 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_mcconnaughey.py b/tests/distance/test_distance_mcconnaughey.py new file mode 100644 index 000000000..bcdbbc243 --- /dev/null +++ b/tests/distance/test_distance_mcconnaughey.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_mcconnaughey. + +This module contains unit tests for abydos.distance.McConnaughey +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import McConnaughey + + +class McConnaugheyTestCases(unittest.TestCase): + """Test McConnaughey functions. 
+ + abydos.distance.McConnaughey + """ + + cmp = McConnaughey() + cmp_no_d = McConnaughey(alphabet=0) + + def test_mcconnaughey_sim(self): + """Test abydos.distance.McConnaughey.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6681818182 + ) + + def test_mcconnaughey_dist(self): + """Test abydos.distance.McConnaughey.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3318181818 + ) + + def test_mcconnaughey_corr(self): + """Test abydos.distance.McConnaughey.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3363636364 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_mcewen_michael.py b/tests/distance/test_distance_mcewen_michael.py new file mode 100644 index 000000000..6df38a2f1 --- /dev/null +++ b/tests/distance/test_distance_mcewen_michael.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_mcewen_michael. 
+ +This module contains unit tests for abydos.distance.McEwenMichael +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import McEwenMichael + + +class McEwenMichaelTestCases(unittest.TestCase): + """Test McEwenMichael functions. + + abydos.distance.McEwenMichael + """ + + cmp = McEwenMichael() + cmp_no_d = McEwenMichael(alphabet=0) + + def test_mcewen_michael_sim(self): + """Test abydos.distance.McEwenMichael.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5101520199916701) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4999165520648357) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5076521509) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5076521509) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5076521509) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5076521509) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5178144947 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.1) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.1) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.1) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.1) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2551020408 + ) + + def test_mcewen_michael_dist(self): + """Test abydos.distance.McEwenMichael.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.48984798000832985) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5000834479351643) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4923478491) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4923478491) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4923478491) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4923478491) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4821855053 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.9) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.9) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 
0.9) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.9) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7448979592 + ) + + def test_mcewen_michael_corr(self): + """Test abydos.distance.McEwenMichael.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.020304039983340273) + self.assertEqual( + self.cmp.corr('abcd', 'efgh'), -0.00016689587032858459 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0153043019) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0153043019) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0153043019) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0153043019) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0356289895 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.8) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.8) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.8) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.8) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.4897959184 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_meta_levenshtein.py b/tests/distance/test_distance_meta_levenshtein.py new file mode 100644 index 000000000..470e867f3 --- /dev/null +++ b/tests/distance/test_distance_meta_levenshtein.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_meta_levenshtein. + +This module contains unit tests for abydos.distance.MetaLevenshtein +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import os +import unittest + +from abydos.corpus import UnigramCorpus +from abydos.distance import Jaccard, MetaLevenshtein +from abydos.tokenizer import QGrams +from abydos.util import download_package, package_path + +from six import PY2 + + +class MetaLevenshteinTestCases(unittest.TestCase): + """Test MetaLevenshtein functions. 
+ + abydos.distance.MetaLevenshtein + """ + + cmp = MetaLevenshtein() + cmp_jac1 = MetaLevenshtein(metric=Jaccard(qval=1)) + + def test_meta_levenshtein_dist(self): + """Test abydos.distance.MetaLevenshtein.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.8463953614713058) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3077801314) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3077801314) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3077801314) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3077801314) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2931752664 + ) + + def test_meta_levenshtein_sim(self): + """Test abydos.distance.MetaLevenshtein.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.15360463852869422) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6922198686) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6922198686) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6922198686) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6922198686) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7068247336 + ) + + self.assertAlmostEqual( + self.cmp_jac1.sim('Nigel', 'Niall'), 0.569107816 + ) + self.assertAlmostEqual( + self.cmp_jac1.sim('Niall', 'Nigel'), 0.569107816 + ) + self.assertAlmostEqual( + self.cmp_jac1.sim('Colin', 'Coiln'), 0.753775895 + ) + self.assertAlmostEqual( + self.cmp_jac1.sim('Coiln', 'Colin'), 0.753775895 + ) + self.assertAlmostEqual( + self.cmp_jac1.sim('ATCAACGAGT', 'AACGATTAG'), 0.5746789477 + ) + + def test_meta_levenshtein_dist_abs(self): + """Test abydos.distance.MetaLevenshtein.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 3.385581445885223) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 1.5389006572 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 1.5389006572 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 1.5389006572 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 1.5389006572 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2.9317526638 + ) + + def test_meta_levenshtein_corpus(self): + """Test abydos.distance.MetaLevenshtein with corpus.""" + if PY2: # disable testing in Py2.7; the pickled data isn't supported + return + + q3_corpus = UnigramCorpus(word_tokenizer=QGrams(qval=3)) + download_package('en_qgram', silent=True) + q3_corpus.load_corpus( + os.path.join(package_path('en_qgram'), 'q3_en.dat') + ) + cmp_q3 = 
MetaLevenshtein(tokenizer=QGrams(qval=3), corpus=q3_corpus) + + self.assertAlmostEqual(cmp_q3.dist_abs('Nigel', 'Niall'), 7.378939370) + self.assertAlmostEqual(cmp_q3.dist_abs('Niall', 'Nigel'), 7.378939370) + self.assertAlmostEqual(cmp_q3.dist_abs('Colin', 'Coiln'), 8.0) + self.assertAlmostEqual(cmp_q3.dist_abs('Coiln', 'Colin'), 8.0) + + self.assertAlmostEqual(cmp_q3.dist('Nigel', 'Niall'), 0.527067098) + self.assertAlmostEqual(cmp_q3.dist('Niall', 'Nigel'), 0.527067098) + self.assertAlmostEqual(cmp_q3.dist('Colin', 'Coiln'), 0.571428571) + self.assertAlmostEqual(cmp_q3.dist('Coiln', 'Colin'), 0.571428571) + + self.assertAlmostEqual(cmp_q3.sim('Nigel', 'Niall'), 0.472932902) + self.assertAlmostEqual(cmp_q3.sim('Niall', 'Nigel'), 0.472932902) + self.assertAlmostEqual(cmp_q3.sim('Colin', 'Coiln'), 0.428571429) + self.assertAlmostEqual(cmp_q3.sim('Coiln', 'Colin'), 0.428571429) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_michelet.py b/tests/distance/test_distance_michelet.py new file mode 100644 index 000000000..32c903c50 --- /dev/null +++ b/tests/distance/test_distance_michelet.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_michelet. + +This module contains unit tests for abydos.distance.Michelet +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Michelet + + +class MicheletTestCases(unittest.TestCase): + """Test Michelet functions. 
+ + abydos.distance.Michelet + """ + + cmp = Michelet() + + def test_michelet_sim(self): + """Test abydos.distance.Michelet.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + def test_michelet_dist(self): + """Test abydos.distance.Michelet.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_minhash.py b/tests/distance/test_distance_minhash.py new file mode 100644 index 000000000..dd0d0c5be --- /dev/null +++ b/tests/distance/test_distance_minhash.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_minhash. + +This module contains unit tests for abydos.distance.MinHash +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import MinHash + + +class MinHashTestCases(unittest.TestCase): + """Test MinHash functions. 
+ + abydos.distance.MinHash + """ + + cmp = MinHash() + + def test_minhash_sim(self): + """Test abydos.distance.MinHash.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.75) + self.assertEqual(self.cmp.sim('', 'abc'), 0.75) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.6) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 1.0) + + def test_minhash_dist(self): + """Test abydos.distance.MinHash.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.25) + self.assertEqual(self.cmp.dist('', 'abc'), 0.25) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.4) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_minkowski.py b/tests/distance/test_distance_minkowski.py index 584cea3c6..56100729b 100644 --- a/tests/distance/test_distance_minkowski.py +++ b/tests/distance/test_distance_minkowski.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Minkowski, dist_minkowski, minkowski, sim_minkowski -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,64 +43,101 @@ class MinkowskiTestCases(unittest.TestCase): """ cmp = Minkowski() + cmp_q2 = Minkowski(tokenizer=QGrams(2)) + cmp_q1p0 = Minkowski(pval=0, tokenizer=QGrams(1)) + cmp_ws = Minkowski(tokenizer=WhitespaceTokenizer()) def test_minkowski_dist_abs(self): """Test abydos.distance.Minkowski.dist_abs.""" self.assertEqual(self.cmp.dist_abs('', ''), 0) self.assertEqual(self.cmp.dist_abs('nelson', ''), 7) self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 8) - self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7) + self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7) - self.assertEqual(self.cmp.dist_abs('', '', 2), 0) - self.assertEqual(self.cmp.dist_abs('nelson', '', 2), 7) - self.assertEqual(self.cmp.dist_abs('', 'neilsen', 2), 8) - self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen', 2), 7) + self.assertEqual(self.cmp_q2.dist_abs('', ''), 0) + self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7) + self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8) + self.assertEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 7) # supplied q-gram tests - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist_abs(QGrams('nelson'), QGrams('')), 7) - self.assertEqual(self.cmp.dist_abs(QGrams(''), QGrams('neilsen')), 8) - self.assertAlmostEqual( - self.cmp.dist_abs(QGrams('nelson'), QGrams('neilsen')), 7 + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 7, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8, + ) + self.assertEqual( + self.cmp.dist_abs( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7, ) # non-q-gram tests - self.assertEqual(self.cmp.dist_abs('', '', 0), 0) - self.assertEqual(self.cmp.dist_abs('the quick', '', 0), 2) - self.assertEqual(self.cmp.dist_abs('', 'the quick', 0), 2) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_FROM, NONQ_TO, 0), 8) - self.assertAlmostEqual(self.cmp.dist_abs(NONQ_TO, NONQ_FROM, 0), 8) + self.assertEqual(self.cmp_ws.dist_abs('', ''), 0) + self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2) + self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2) + self.assertEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8) + self.assertEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8) # test l_0 "norm" - self.assertEqual(self.cmp.dist_abs('', '', 1, 0), 0) - self.assertEqual(self.cmp.dist_abs('a', '', 1, 0), 1) - self.assertEqual(self.cmp.dist_abs('a', 'b', 1, 0), 2) - self.assertEqual(self.cmp.dist_abs('ab', 'b', 1, 0), 1) - self.assertEqual(self.cmp.dist_abs('aab', 'b', 1, 0), 1) - self.assertEqual(self.cmp.dist_abs('', '', 1, 0, True), 0) - self.assertEqual(self.cmp.dist_abs('a', '', 1, 0, True), 1) - self.assertEqual(self.cmp.dist_abs('a', 'b', 1, 0, True), 1) - self.assertEqual(self.cmp.dist_abs('ab', 'b', 1, 0, True), 1 / 2) - self.assertEqual(self.cmp.dist_abs('aab', 'b', 1, 0, True), 1 / 2) - self.assertEqual(self.cmp.dist_abs('aaab', 'b', 1, 0, True), 1 / 2) - self.assertEqual(self.cmp.dist_abs('aaab', 'ab', 1, 0, True), 1 / 2) + self.assertEqual(self.cmp_q1p0.dist_abs('', ''), 0) + self.assertEqual(self.cmp_q1p0.dist_abs('a', ''), 1) + self.assertEqual(self.cmp_q1p0.dist_abs('a', 'b'), 2) + 
self.assertEqual(self.cmp_q1p0.dist_abs('ab', 'b'), 1) + self.assertEqual(self.cmp_q1p0.dist_abs('aab', 'b'), 1) + self.assertEqual(self.cmp_q1p0.dist_abs('', '', normalized=True), 0) + self.assertEqual(self.cmp_q1p0.dist_abs('a', '', normalized=True), 1) + self.assertEqual(self.cmp_q1p0.dist_abs('a', 'b', normalized=True), 1) + self.assertEqual( + self.cmp_q1p0.dist_abs('ab', 'b', normalized=True), 1 / 2 + ) + self.assertEqual( + self.cmp_q1p0.dist_abs('aab', 'b', normalized=True), 1 / 2 + ) + self.assertEqual( + self.cmp_q1p0.dist_abs('aaab', 'b', normalized=True), 1 / 2 + ) + self.assertEqual( + self.cmp_q1p0.dist_abs('aaab', 'ab', normalized=True), 1 / 2 + ) # test with alphabet - self.assertEqual(self.cmp.dist_abs('ab', 'b', 1, alphabet=26), 1) self.assertEqual( - self.cmp.dist_abs('ab', 'b', 1, normalized=True, alphabet=26), - 1 / 26, + Minkowski(tokenizer=QGrams(1), alphabet=26).dist_abs('ab', 'b'), 1 ) self.assertEqual( - self.cmp.dist_abs( - 'ab', - 'b', - 1, - normalized=True, - alphabet='abcdefghijklmnopqrstuvwxyz', + Minkowski(tokenizer=QGrams(1), alphabet=26).dist_abs( + 'ab', 'b', normalized=True ), 1 / 26, ) + self.assertEqual( + Minkowski( + tokenizer=QGrams(1), alphabet='abcdefghijklmnopqrstuvwxyz' + ).dist_abs('ab', 'b', normalized=True), + 1 / 26, + ) + + self.assertEqual( + Minkowski(pval=float('inf')).dist_abs('nelsonian', 'neilsen'), 1.0 + ) # Test wrapper self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7) @@ -112,25 +149,47 @@ def test_minkowski_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 8 / 15) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 8 / 15 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 8 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper 
self.assertAlmostEqual(sim_minkowski('nelson', 'neilsen'), 8 / 15) @@ -142,25 +201,47 @@ def test_minkowski_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(dist_minkowski('nelson', 'neilsen', 2), 7 / 15) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 7 / 15 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 / 15, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 1 / 2) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 1 / 2) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2) # Test wrapper self.assertAlmostEqual(dist_minkowski('nelson', 'neilsen'), 7 / 15) diff --git a/tests/distance/test_distance_monge_elkan.py b/tests/distance/test_distance_monge_elkan.py index e0be27a42..ebb3e0928 100644 --- a/tests/distance/test_distance_monge_elkan.py +++ b/tests/distance/test_distance_monge_elkan.py @@ -30,7 +30,12 @@ import unittest -from abydos.distance import MongeElkan, dist_monge_elkan, sim_monge_elkan +from abydos.distance import ( + Jaccard, + MongeElkan, + dist_monge_elkan, + sim_monge_elkan, +) class MongeElkanTestCases(unittest.TestCase): @@ -40,6 +45,8 @@ class MongeElkanTestCases(unittest.TestCase): """ cmp = MongeElkan() + cmp_sym = MongeElkan(symmetric=True) + cmp_jac = MongeElkan(sim_func=Jaccard()) def test_monge_elkan_sim(self): """Test abydos.distance.MongeElkan.sim.""" @@ -52,16 +59,12 @@ def test_monge_elkan_sim(self): self.assertEqual(self.cmp.sim('Niall', 'Niel'), 3 / 4) self.assertEqual(self.cmp.sim('Niall', 'Nigel'), 3 / 4) - self.assertEqual( - self.cmp.sim('Niall', 'Neal', symmetric=True), 31 / 40 - ) - self.assertEqual(self.cmp.sim('Niall', 'Njall', symmetric=True), 5 / 6) - self.assertEqual( - self.cmp.sim('Niall', 'Niel', symmetric=True), 31 / 40 - ) - self.assertAlmostEqual( - self.cmp.sim('Niall', 'Nigel', symmetric=True), 17 / 24 - ) + self.assertEqual(self.cmp_sym.sim('Niall', 'Neal'), 31 / 40) + 
self.assertEqual(self.cmp_sym.sim('Niall', 'Njall'), 5 / 6) + self.assertEqual(self.cmp_sym.sim('Niall', 'Niel'), 31 / 40) + self.assertAlmostEqual(self.cmp_sym.sim('Niall', 'Nigel'), 17 / 24) + + self.assertEqual(self.cmp_jac.sim('Njall', 'Neil'), 29 / 60) # Test wrapper self.assertEqual(sim_monge_elkan('Niall', 'Neal'), 3 / 4) @@ -76,18 +79,10 @@ def test_monge_elkan_dist(self): self.assertEqual(self.cmp.dist('Niall', 'Niel'), 1 / 4) self.assertEqual(self.cmp.dist('Niall', 'Nigel'), 1 / 4) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Neal', symmetric=True), 9 / 40 - ) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Njall', symmetric=True), 1 / 6 - ) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Niel', symmetric=True), 9 / 40 - ) - self.assertAlmostEqual( - self.cmp.dist('Niall', 'Nigel', symmetric=True), 7 / 24 - ) + self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Neal'), 9 / 40) + self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Njall'), 1 / 6) + self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Niel'), 9 / 40) + self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Nigel'), 7 / 24) # Test wrapper self.assertEqual(dist_monge_elkan('Niall', 'Neal'), 1 / 4) diff --git a/tests/distance/test_distance_mountford.py b/tests/distance/test_distance_mountford.py new file mode 100644 index 000000000..ebf064a2b --- /dev/null +++ b/tests/distance/test_distance_mountford.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_mountford. + +This module contains unit tests for abydos.distance.Mountford +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Mountford + + +class MountfordTestCases(unittest.TestCase): + """Test Mountford functions. 
+ + abydos.distance.Mountford + """ + + cmp = Mountford() + + def test_mountford_sim(self): + """Test abydos.distance.Mountford.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1666666667) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1917808219 + ) + + def test_mountford_dist(self): + """Test abydos.distance.Mountford.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.19999999999999996) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8333333333) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8082191781 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_ms_contingency.py b/tests/distance/test_distance_ms_contingency.py new file mode 100644 index 000000000..fb5a66651 --- /dev/null +++ b/tests/distance/test_distance_ms_contingency.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_ms_contingency. + +This module contains unit tests for abydos.distance.MSContingency +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import MSContingency + + +class MSContingencyTestCases(unittest.TestCase): + """Test MSContingency functions. 
+ + abydos.distance.MSContingency + """ + + cmp = MSContingency() + cmp_no_d = MSContingency(alphabet=0) + cmp_4q1 = MSContingency(qval=1, alphabet=4) + + def test_ms_contingency_sim(self): + """Test abydos.distance.MSContingency.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49546153904804724) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8142722325) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8142722325) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8142722325) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8142722325) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.890704164 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.183772234 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.183772234 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.183772234 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.183772234 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.2782336187 + ) + + def test_ms_contingency_dist(self): + """Test abydos.distance.MSContingency.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5045384609519528) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.1857277675) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.1857277675) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1857277675) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1857277675) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.109295836 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.816227766 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.816227766 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.816227766 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.816227766 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.7217663813 + ) + + def 
test_ms_contingency_corr(self): + """Test abydos.distance.MSContingency.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.009076921903905553) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.628544465) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.628544465) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.628544465) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.628544465) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.781408328 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.4435327626 + ) + + self.assertEqual(self.cmp_4q1.corr('ab', 'ac'), 0.0) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_mutual_information.py b/tests/distance/test_distance_mutual_information.py new file mode 100644 index 000000000..9e04a93c6 --- /dev/null +++ b/tests/distance/test_distance_mutual_information.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_mutual_information. + +This module contains unit tests for abydos.distance.MutualInformation +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import MutualInformation + + +class MutualInformationTestCases(unittest.TestCase): + """Test MutualInformation functions. 
+ + abydos.distance.MutualInformation + """ + + cmp = MutualInformation() + cmp_no_d = MutualInformation(alphabet=0) + + def test_mutual_information_sim(self): + """Test abydos.distance.MutualInformation.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.1752299652353853) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9284965499) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9284965499) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9284965499) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9284965499) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9481813127 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('a', 'eh'), -0.9036774610288023 + ) + + def test_mutual_information_dist(self): + """Test abydos.distance.MutualInformation.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.8247700347646147) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0715034501) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0715034501) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0715034501) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0715034501) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0518186873 + ) + + def test_mutual_information_sim_score(self): + """Test abydos.distance.MutualInformation.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 7.527706972593264) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -4.700439718141093 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 5.9908322396 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 5.9908322396 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 5.9908322396 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 5.9908322396 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 5.6279117576 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), -4.700439718141093 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), -0.4020984436 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), -0.4020984436 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), -0.4020984436 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), -0.4020984436 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.1650592463 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_ncd_arith.py b/tests/distance/test_distance_ncd_arith.py index d8814154a..9af35ebee 100644 --- a/tests/distance/test_distance_ncd_arith.py +++ b/tests/distance/test_distance_ncd_arith.py @@ -44,32 +44,29 @@ class NCDarithTestCases(unittest.TestCase): arith = Arithmetic(' '.join(NIALL)) cmp = NCDarith() + cmp_probs = NCDarith(arith.get_probs()) def test_ncd_arith_dist(self): """Test abydos.distance.NCDarith.dist.""" self.assertEqual(self.cmp.dist('', ''), 0) - self.assertEqual(self.cmp.dist('', '', self.arith.get_probs()), 0) + self.assertEqual(self.cmp_probs.dist('', ''), 0) self.assertGreater(self.cmp.dist('a', ''), 0) - self.assertGreater(self.cmp.dist('a', '', self.arith.get_probs()), 0) + self.assertGreater(self.cmp_probs.dist('a', ''), 0) self.assertGreater(self.cmp.dist('abcdefg', 'fg'), 0) self.assertAlmostEqual( - self.cmp.dist('Niall', 'Neil', self.arith.get_probs()), - 0.608695652173913, + self.cmp_probs.dist('Niall', 'Neil'), 0.608695652173913 ) self.assertAlmostEqual( - self.cmp.dist('Neil', 'Niall', self.arith.get_probs()), - 0.608695652173913, + self.cmp_probs.dist('Neil', 'Niall'), 0.608695652173913 ) self.assertAlmostEqual(self.cmp.dist('Niall', 'Neil'), 0.6875) self.assertAlmostEqual(self.cmp.dist('Neil', 'Niall'), 0.6875) self.assertAlmostEqual( - self.cmp.dist('Njáll', 'Njall', self.arith.get_probs()), - 0.714285714285714, + self.cmp_probs.dist('Njáll', 'Njall'), 0.714285714285714 ) self.assertAlmostEqual( - self.cmp.dist('Njall', 'Njáll', self.arith.get_probs()), - 0.714285714285714, + self.cmp_probs.dist('Njall', 'Njáll'), 0.714285714285714 ) self.assertAlmostEqual(self.cmp.dist('Njáll', 'Njall'), 0.75) self.assertAlmostEqual(self.cmp.dist('Njall', 'Njáll'), 0.75) @@ -83,28 +80,24 @@ def test_ncd_arith_dist(self): def test_ncd_arith_sim(self): """Test abydos.distance.NCDarith.sim.""" self.assertEqual(self.cmp.sim('', ''), 1) - self.assertEqual(self.cmp.sim('', '', self.arith.get_probs()), 1) + self.assertEqual(self.cmp_probs.sim('', ''), 1) self.assertLess(self.cmp.sim('a', ''), 1) - self.assertLess(self.cmp.sim('a', '', self.arith.get_probs()), 1) + self.assertLess(self.cmp_probs.sim('a', ''), 1) self.assertLess(self.cmp.sim('abcdefg', 'fg'), 1) self.assertAlmostEqual( - self.cmp.sim('Niall', 'Neil', self.arith.get_probs()), - 0.3913043478260869, + self.cmp_probs.sim('Niall', 'Neil'), 0.3913043478260869 ) self.assertAlmostEqual( - self.cmp.sim('Neil', 'Niall', self.arith.get_probs()), - 0.3913043478260869, + self.cmp_probs.sim('Neil', 'Niall'), 0.3913043478260869 ) self.assertAlmostEqual(self.cmp.sim('Niall', 'Neil'), 0.3125) self.assertAlmostEqual(self.cmp.sim('Neil', 'Niall'), 0.3125) self.assertAlmostEqual( - self.cmp.sim('Njáll', 'Njall', self.arith.get_probs()), - 0.285714285714285, + self.cmp_probs.sim('Njáll', 'Njall'), 0.285714285714285 ) self.assertAlmostEqual( - self.cmp.sim('Njall', 'Njáll', self.arith.get_probs()), - 0.285714285714285, + self.cmp_probs.sim('Njall', 'Njáll'), 0.285714285714285 ) self.assertAlmostEqual(self.cmp.sim('Njáll', 'Njall'), 0.25) self.assertAlmostEqual(self.cmp.sim('Njall', 'Njáll'), 0.25) diff --git a/tests/distance/test_distance_ncd_lzss.py 
b/tests/distance/test_distance_ncd_lzss.py new file mode 100644 index 000000000..53a095756 --- /dev/null +++ b/tests/distance/test_distance_ncd_lzss.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_ncd_lzss. + +This module contains unit tests for abydos.distance.NCDlzss +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import NCDlzss + + +class NCDlzssTestCases(unittest.TestCase): + """Test NCDlzss functions. + + abydos.distance.NCDlzss + """ + + cmp = NCDlzss() + + def test_ncd_lzss_dist(self): + """Test abydos.distance.NCDlzss.dist.""" + try: + import lzss # noqa: F401 + except ImportError: # pragma: no cover + return + + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.8) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_ncd_lzss_sim(self): + """Test abydos.distance.NCDlzss.sim.""" + try: + import lzss # noqa: F401 + except ImportError: # pragma: no cover + return + + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.19999999999999996) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_ncd_paq9a.py b/tests/distance/test_distance_ncd_paq9a.py new file mode 100644 index 000000000..f80dce890 --- /dev/null +++ b/tests/distance/test_distance_ncd_paq9a.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_ncd_paq9a. + +This module contains unit tests for abydos.distance.NCDpaq9a +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import NCDpaq9a + +from six import PY2 + + +class NCDpaq9aTestCases(unittest.TestCase): + """Test NCDpaq9a functions. + + abydos.distance.NCDpaq9a + """ + + cmp = NCDpaq9a() + + def test_ncd_paq9a_dist(self): + """Test abydos.distance.NCDpaq9a.dist.""" + if PY2: # skip tests since paq9a isn't supported on Python 2.7 + return + try: + import paq # noqa: F401 + except ImportError: # pragma: no cover + return + + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.2) + self.assertEqual(self.cmp.dist('', 'a'), 0.2) + self.assertEqual(self.cmp.dist('abc', ''), 0.42857142857142855) + self.assertEqual(self.cmp.dist('', 'abc'), 0.42857142857142855) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5555555556) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5555555556) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5555555556) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5555555556) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6153846154 + ) + + def test_ncd_paq9a_sim(self): + """Test abydos.distance.NCDpaq9a.sim.""" + if PY2: # skip tests since paq9a isn't supported on Python 2.7 + return + + try: + import paq # noqa: F401 + except ImportError: # pragma: no cover + return + + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.8) + self.assertEqual(self.cmp.sim('', 'a'), 0.8) + self.assertEqual(self.cmp.sim('abc', ''), 0.5714285714285714) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5714285714285714) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4444444444) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4444444444) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4444444444) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4444444444) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3846153846 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_needleman_wunsch.py b/tests/distance/test_distance_needleman_wunsch.py index 1804fe35d..6308664e4 100644 --- a/tests/distance/test_distance_needleman_wunsch.py +++ b/tests/distance/test_distance_needleman_wunsch.py @@ -132,25 +132,28 @@ class NeedlemanWunschTestCases(unittest.TestCase): abydos.distance.NeedlemanWunsch """ - cmp = NeedlemanWunsch() - def test_needleman_wunsch_dist_abs(self): """Test 
abydos.distance.NeedlemanWunsch.dist_abs.""" - self.assertEqual(needleman_wunsch('', ''), 0) + self.assertEqual(NeedlemanWunsch().dist_abs('', ''), 0) # https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm - self.assertEqual(needleman_wunsch('GATTACA', 'GCATGCU', 1, _sim_nw), 0) - self.assertEqual( - needleman_wunsch('AGACTAGTTAC', 'CGAGACGT', 5, _sim_wikipedia), 16 - ) - - # checked against http://ds9a.nl/nwunsch/ (mismatch=1, gap=5, skew=5) self.assertEqual( - needleman_wunsch('CGATATCAG', 'TGACGSTGC', 5, _sim_nw), -5 + NeedlemanWunsch(1, _sim_nw).dist_abs('GATTACA', 'GCATGCU'), 0 ) self.assertEqual( - needleman_wunsch('AGACTAGTTAC', 'TGACGSTGC', 5, _sim_nw), -7 + NeedlemanWunsch(5, _sim_wikipedia).dist_abs( + 'AGACTAGTTAC', 'CGAGACGT' + ), + 16, ) + + # checked against http://ds9a.nl/nwunsch/ (mismatch=1, gap=5, skew=5) + nw5 = NeedlemanWunsch(5, _sim_nw) + self.assertEqual(nw5.dist_abs('CGATATCAG', 'TGACGSTGC'), -5) + self.assertEqual(nw5.dist_abs('AGACTAGTTAC', 'TGACGSTGC'), -7) + self.assertEqual(nw5.dist_abs('AGACTAGTTAC', 'CGAGACGT'), -15) + + # test wrapper self.assertEqual( needleman_wunsch('AGACTAGTTAC', 'CGAGACGT', 5, _sim_nw), -15 ) @@ -159,10 +162,9 @@ def test_needleman_wunsch_dist_abs_nialls(self): """Test abydos.distance.NeedlemanWunsch.dist_abs (Nialls set).""" # checked against http://ds9a.nl/nwunsch/ (mismatch=1, gap=2, skew=2) nw_vals = (5, 0, -2, 3, 1, 1, -2, -2, -1, -3, -3, -5, -3, -7, -7, -19) + nw2 = NeedlemanWunsch(2, _sim_nw) for i in range(len(NIALL)): - self.assertEqual( - needleman_wunsch(NIALL[0], NIALL[i], 2, _sim_nw), nw_vals[i] - ) + self.assertEqual(nw2.dist_abs(NIALL[0], NIALL[i]), nw_vals[i]) if __name__ == '__main__': diff --git a/tests/distance/test_distance_overlap.py b/tests/distance/test_distance_overlap.py index ec7ed900c..fa26feed5 100644 --- a/tests/distance/test_distance_overlap.py +++ b/tests/distance/test_distance_overlap.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Overlap, dist_overlap, sim_overlap -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class OverlapTestCases(unittest.TestCase): """ cmp = Overlap() + cmp_q2 = Overlap(tokenizer=QGrams(2)) + cmp_ws = Overlap(tokenizer=WhitespaceTokenizer()) def test_overlap_sim(self): """Test abydos.distance.Overlap.sim.""" @@ -51,25 +53,47 @@ def test_overlap_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 7) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 4 / 7) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 7) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 4 / 7 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 4 / 7, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 4 / 7) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 4 / 7) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 4 / 7) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 4 / 7) # Test wrapper self.assertAlmostEqual(sim_overlap('nelson', 'neilsen'), 4 / 7) @@ -81,25 +105,47 @@ def test_overlap_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 3 / 7) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 2), 3 / 7) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 3 / 7) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + 
QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 3 / 7 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 3 / 7, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 3 / 7) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 3 / 7) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 3 / 7) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 3 / 7) # Test wrapper self.assertAlmostEqual(dist_overlap('nelson', 'neilsen'), 3 / 7) diff --git a/tests/distance/test_distance_ozbay.py b/tests/distance/test_distance_ozbay.py new file mode 100644 index 000000000..0b3b01921 --- /dev/null +++ b/tests/distance/test_distance_ozbay.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_ozbay. + +This module contains unit tests for abydos.distance.Ozbay +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Ozbay + + +class OzbayTestCases(unittest.TestCase): + """Test Ozbay metric functions. 
+ + abydos.distance.Ozbay + """ + + cmp = Ozbay() + + def test_ozbay_dist_abs(self): + """Test abydos.distance.Ozbay.dist_abs.""" + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + + self.assertAlmostEqual( + self.cmp.dist_abs('piccadilly', 'bandage'), 73.63636363636363 + ) + self.assertAlmostEqual(self.cmp.dist_abs('abcd', 'efgh'), 16) + + # Test cases from https://github.com/hakanozbay/ozbay-metric + self.assertEqual(self.cmp.dist_abs('ban', 'ban'), 0.0) + self.assertAlmostEqual(self.cmp.dist_abs('ban', 'bane'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist_abs('ban', 'band'), 0.3333333333) + self.assertEqual(self.cmp.dist_abs('ban', 'bat'), 0.75) + self.assertAlmostEqual(self.cmp.dist_abs('ban', 'bands'), 1.3333333333) + self.assertEqual(self.cmp.dist_abs('ban', 'banana'), 2.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ban', 'bandana'), 2.3333333333 + ) + self.assertEqual(self.cmp.dist_abs('ban', 'bandit'), 3.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ban', 'bandage'), 4.6666666666 + ) + + self.assertEqual(self.cmp.dist_abs('piccadilly', 'piccadilly'), 0.0) + self.assertEqual(self.cmp.dist_abs('piccadilly', 'piccadilyl'), 0.25) + self.assertAlmostEqual( + self.cmp.dist_abs('piccadilly', 'piccadlily'), 0.3333333333 + ) + self.assertEqual(self.cmp.dist_abs('piccadilly', 'picacdilly'), 0.4) + self.assertEqual(self.cmp.dist_abs('piccadilly', 'picadily'), 0.4) + self.assertEqual(self.cmp.dist_abs('picadily', 'piccadilly'), 0.5) + self.assertAlmostEqual( + self.cmp.dist_abs('piccadilly', 'picacdlily'), 1.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ipcacdily', 'piccadilly'), 1.4814814814814814 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('piccadilly', 'ipcacdily'), 1.333333333 + ) + self.assertEqual(self.cmp.dist_abs('piccadilly', 'pcicadlyil'), 2.0) + + def test_ozbay_dist(self): + """Test abydos.distance.Ozbay.dist.""" + self.assertEqual(self.cmp.dist('', ''), 0) + + self.assertAlmostEqual( + self.cmp.dist('piccadilly', 'bandage'), 0.9467532467532467 + ) + self.assertAlmostEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + # Test cases from https://github.com/hakanozbay/ozbay-metric + self.assertEqual(self.cmp.dist('ban', 'ban'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('ban', 'bane'), 0.006944444444444444 + ) + self.assertAlmostEqual( + self.cmp.dist('ban', 'band'), 0.006944444444444444 + ) + self.assertEqual(self.cmp.dist('ban', 'bat'), 0.02777777777777778) + self.assertAlmostEqual( + self.cmp.dist('ban', 'bands'), 0.03555555555555556 + ) + self.assertEqual(self.cmp.dist('ban', 'banana'), 0.05555555555555555) + self.assertAlmostEqual( + self.cmp.dist('ban', 'bandana'), 0.0634920634920635 + ) + self.assertEqual(self.cmp.dist('ban', 'bandit'), 0.08333333333333333) + self.assertAlmostEqual( + self.cmp.dist('ban', 'bandage'), 0.126984126984127 + ) + + self.assertEqual(self.cmp.dist('piccadilly', 'piccadilly'), 0.0) + self.assertEqual( + self.cmp.dist('piccadilly', 'piccadilyl'), 0.0004999999999999999 + ) + self.assertAlmostEqual( + self.cmp.dist('piccadilly', 'piccadlily'), 0.0013333333333333335 + ) + self.assertEqual(self.cmp.dist('piccadilly', 'picacdilly'), 0.002) + self.assertEqual(self.cmp.dist('piccadilly', 'picadily'), 0.0025) + self.assertEqual(self.cmp.dist('picadily', 'piccadilly'), 0.003125) + self.assertAlmostEqual( + self.cmp.dist('piccadilly', 'picacdlily'), 0.009333333333333334 + ) + self.assertAlmostEqual( + self.cmp.dist('ipcacdily', 'piccadilly'), 0.011522633744855966 + ) + self.assertAlmostEqual( + self.cmp.dist('piccadilly', 
'ipcacdily'), 0.01037037037037037 + ) + self.assertEqual(self.cmp.dist('piccadilly', 'pcicadlyil'), 0.014) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_pattern.py b/tests/distance/test_distance_pattern.py new file mode 100644 index 000000000..30332e67f --- /dev/null +++ b/tests/distance/test_distance_pattern.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_pattern. + +This module contains unit tests for abydos.distance.Pattern +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Pattern + + +class PatternTestCases(unittest.TestCase): + """Test Pattern functions. + + abydos.distance.Pattern + """ + + cmp = Pattern() + cmp_no_d = Pattern(alphabet=0) + + def test_pattern_dist(self): + """Test abydos.distance.Pattern.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.0001626926280716368) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 5.85693e-05) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 7.80925e-05 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.2448979592 + ) + + def test_pattern_sim(self): + """Test abydos.distance.Pattern.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + 
self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9998373073719283) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9999414307) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9999219075 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.7551020408 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_pearson_chi_squared.py b/tests/distance/test_distance_pearson_chi_squared.py new file mode 100644 index 000000000..5bd1fd401 --- /dev/null +++ b/tests/distance/test_distance_pearson_chi_squared.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_pearson_chi_squared. + +This module contains unit tests for abydos.distance.PearsonChiSquared +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PearsonChiSquared +from abydos.tokenizer import QSkipgrams + + +class PearsonChiSquaredTestCases(unittest.TestCase): + """Test PearsonChiSquared functions. 
+ + abydos.distance.PearsonChiSquared + """ + + cmp = PearsonChiSquared() + cmp_no_d = PearsonChiSquared(alphabet=0) + + def test_pearson_chi_squared_sim(self): + """Test abydos.distance.PearsonChiSquared.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4999794015236281) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.623079414) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.623079414) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.623079414) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.623079414) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7197346065 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.375) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.375) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + def test_pearson_chi_squared_dist(self): + """Test abydos.distance.PearsonChiSquared.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5000205984763719) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.376920586) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.376920586) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.376920586) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.376920586) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2802653935 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.625) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.625) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + def test_pearson_chi_squared_sim_score(self): + """Test abydos.distance.PearsonChiSquared.sim_score.""" + # Base cases + 
self.assertEqual(self.cmp.sim_score('', ''), 784.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 784.0) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), 0.032298410951138765 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 192.9885210909 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 192.9885210909 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 192.9885210909 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 192.9885210909 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 344.5438630111 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 4.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 10.0) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 2.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 2.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 2.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 2.25) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.5272727273 + ) + + self.assertEqual( + PearsonChiSquared( + alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK') + ).sim_score('a', 'eh'), + 0.0, + ) + + def test_pearson_chi_squared_corr(self): + """Test abydos.distance.PearsonChiSquared.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp.corr('abcd', 'efgh'), -4.1196952743799446e-05 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.2461588279) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.2461588279) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.2461588279) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.2461588279) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.439469213 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.25) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.25) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.25) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.25) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.1090909091 + ) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/tests/distance/test_distance_pearson_heron_ii.py b/tests/distance/test_distance_pearson_heron_ii.py new file mode 100644 index 000000000..9068d0620 --- /dev/null +++ b/tests/distance/test_distance_pearson_heron_ii.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_pearson_heron_ii. + +This module contains unit tests for abydos.distance.PearsonHeronII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PearsonHeronII + + +class PearsonHeronIITestCases(unittest.TestCase): + """Test PearsonHeronII functions. + + abydos.distance.PearsonHeronII + """ + + cmp = PearsonHeronII() + cmp_no_d = PearsonHeronII(alphabet=0) + + def test_pearson_heron_ii_sim(self): + """Test abydos.distance.PearsonHeronII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9915587467) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9949989546 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_pearson_heron_ii_dist(self): + """Test abydos.distance.PearsonHeronII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + 
self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0084412533) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0050010454 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_pearson_heron_ii_corr(self): + """Test abydos.distance.PearsonHeronII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9831174935) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9899979092 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_pearson_ii.py b/tests/distance/test_distance_pearson_ii.py new file mode 100644 index 000000000..947d8a8b1 --- /dev/null +++ b/tests/distance/test_distance_pearson_ii.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_pearson_ii. + +This module contains unit tests for abydos.distance.PearsonII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PearsonII + + +class PearsonIITestCases(unittest.TestCase): + """Test PearsonII functions. + + abydos.distance.PearsonII + """ + + cmp = PearsonII() + cmp_no_d = PearsonII(alphabet=0) + + def test_pearson_ii_sim(self): + """Test abydos.distance.PearsonII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.009076921903905551) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.628544465) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.628544465) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.628544465) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.628544465) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.781408328 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.632455532 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4435327626 + ) + + def test_pearson_ii_dist(self): + """Test abydos.distance.PearsonII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9909230780960945) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.371455535) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.371455535) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.371455535) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.371455535) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.218591672 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 
1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.367544468 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.367544468 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.367544468 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.367544468 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5564672374 + ) + + def test_pearson_ii_sim_score(self): + """Test abydos.distance.PearsonII.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.7071067811865476) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.7071067811865476) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), 0.006418353030552324 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.4444480535 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.4444480535 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.4444480535 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.4444480535 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.5525391276 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.7071067811865476) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual( + self.cmp_no_d.sim_score('abc', 'abc'), 0.7071067811865476 + ) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), 0.7071067811865476 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.4472135955 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.4472135955 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.4472135955 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.4472135955 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.3136250241 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_pearson_iii.py b/tests/distance/test_distance_pearson_iii.py new file mode 100644 index 000000000..1ac8155a3 --- /dev/null +++ b/tests/distance/test_distance_pearson_iii.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
+ +"""abydos.tests.distance.test_distance_pearson_iii. + +This module contains unit tests for abydos.distance.PearsonIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PearsonIII + + +class PearsonIIITestCases(unittest.TestCase): + """Test PearsonIII functions. + + abydos.distance.PearsonIII + """ + + cmp = PearsonIII() + cmp_no_d = PearsonIII(alphabet=0) + + def test_pearson_iii_sim(self): + """Test abydos.distance.PearsonIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5178457652562063) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5178457652562063) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49856936111823147) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5125741446) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5125741446) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5125741446) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5125741446) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5145331766 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.7236067977499789) + self.assertEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 0.33333333333333337 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3787321875 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3787321875 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3787321875 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3787321875 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.422279161 + ) + + def test_pearson_iii_dist(self): + """Test abydos.distance.PearsonIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.4821542347437937) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.4821542347437937) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5014306388817685) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4874258554) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4874258554) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4874258554) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4874258554) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4854668234 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.27639320225002106) + self.assertEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.6666666666666666 + ) + + 
self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6212678125 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6212678125 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6212678125 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6212678125 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.577720839 + ) + + def test_pearson_iii_corr(self): + """Test abydos.distance.PearsonIII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.03569153051241248) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.03569153051241248) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.0028612777635371113) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0251482893) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0251482893) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0251482893) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0251482893) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0290663533 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.4472135954999579) + self.assertEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.3333333333333333 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.242535625 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.242535625 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.242535625 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.242535625 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.155441678 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_pearson_phi.py b/tests/distance/test_distance_pearson_phi.py new file mode 100644 index 000000000..7da27f643 --- /dev/null +++ b/tests/distance/test_distance_pearson_phi.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_pearson_phi. + +This module contains unit tests for abydos.distance.PearsonPhi +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PearsonPhi + + +class PearsonPhiTestCases(unittest.TestCase): + """Test PearsonPhi functions. 
+ + abydos.distance.PearsonPhi + """ + + cmp = PearsonPhi() + cmp_no_d = PearsonPhi(alphabet=0) + + def test_pearson_phi_sim(self): + """Test abydos.distance.PearsonPhi.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8314623708 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3348554352 + ) + + def test_pearson_phi_dist(self): + """Test abydos.distance.PearsonPhi.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1685376292 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6651445648 + ) + + def test_pearson_phi_corr(self): + """Test abydos.distance.PearsonPhi.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + 
self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6629247416 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3302891295 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_peirce.py b/tests/distance/test_distance_peirce.py new file mode 100644 index 000000000..3451e208d --- /dev/null +++ b/tests/distance/test_distance_peirce.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_peirce. + +This module contains unit tests for abydos.distance.Peirce +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Peirce + + +class PeirceTestCases(unittest.TestCase): + """Test Peirce functions. 
+ + abydos.distance.Peirce + """ + + cmp = Peirce() + cmp_no_d = Peirce(alphabet=0) + + def test_peirce_sim(self): + """Test abydos.distance.Peirce.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8162413266 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3181818182 + ) + + def test_peirce_dist(self): + """Test abydos.distance.Peirce.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1837586734 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6818181818 + ) + + def test_peirce_corr(self): + """Test abydos.distance.Peirce.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6324826532 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_positional_q_gram_dice.py b/tests/distance/test_distance_positional_q_gram_dice.py new file mode 100644 index 000000000..61fe66682 --- /dev/null +++ b/tests/distance/test_distance_positional_q_gram_dice.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_positional_q_gram_dice. + +This module contains unit tests for abydos.distance.PositionalQGramDice +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PositionalQGramDice + + +class PositionalQGramDiceTestCases(unittest.TestCase): + """Test PositionalQGramDice functions. 
+ + abydos.distance.PositionalQGramDice + """ + + cmp = PositionalQGramDice() + + def test_positional_q_gram_dice_sim(self): + """Test abydos.distance.PositionalQGramDice.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1904761905 + ) + + def test_positional_q_gram_dice_dist(self): + """Test abydos.distance.PositionalQGramDice.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8095238095 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_positional_q_gram_jaccard.py b/tests/distance/test_distance_positional_q_gram_jaccard.py new file mode 100644 index 000000000..0463e7268 --- /dev/null +++ b/tests/distance/test_distance_positional_q_gram_jaccard.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_positional_q_gram_jaccard. + +This module contains unit tests for abydos.distance.PositionalQGramJaccard +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PositionalQGramJaccard + + +class PositionalQGramJaccardTestCases(unittest.TestCase): + """Test PositionalQGramJaccard functions. 
+ + abydos.distance.PositionalQGramJaccard + """ + + cmp = PositionalQGramJaccard() + + def test_positional_q_gram_jaccard_sim(self): + """Test abydos.distance.PositionalQGramJaccard.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1052631579 + ) + + def test_positional_q_gram_jaccard_dist(self): + """Test abydos.distance.PositionalQGramJaccard.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8947368421 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_positional_q_gram_overlap.py b/tests/distance/test_distance_positional_q_gram_overlap.py new file mode 100644 index 000000000..9168740aa --- /dev/null +++ b/tests/distance/test_distance_positional_q_gram_overlap.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_positional_q_gram_overlap. + +This module contains unit tests for abydos.distance.PositionalQGramOverlap +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import PositionalQGramOverlap + + +class PositionalQGramOverlapTestCases(unittest.TestCase): + """Test PositionalQGramOverlap functions. 
+ + abydos.distance.PositionalQGramOverlap + """ + + cmp = PositionalQGramOverlap() + + def test_positional_q_gram_overlap_sim(self): + """Test abydos.distance.PositionalQGramOverlap.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.2) + + def test_positional_q_gram_overlap_dist(self): + """Test abydos.distance.PositionalQGramOverlap.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_q_gram.py b/tests/distance/test_distance_q_gram.py new file mode 100644 index 000000000..078426cf4 --- /dev/null +++ b/tests/distance/test_distance_q_gram.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_q_gram. + +This module contains unit tests for abydos.distance.QGram +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import QGram +from abydos.tokenizer import WhitespaceTokenizer + + +class QGramTestCases(unittest.TestCase): + """Test QGram functions. 
+ + abydos.distance.QGram + """ + + cmp = QGram() + cmp_q1 = QGram(qval=1) + cmp_ws = QGram(tokenizer=WhitespaceTokenizer()) + + def test_q_gram_dist(self): + """Test abydos.distance.QGram.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8571428571) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8571428571) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8571428571) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8571428571) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4545454545 + ) + + def test_q_gram_sim(self): + """Test abydos.distance.QGram.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1428571429) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1428571429) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1428571429) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1428571429) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5454545455 + ) + + def test_q_gram_dist_abs(self): + """Test abydos.distance.QGram.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 2) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 2) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 6) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 6) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 6) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 6) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 6) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 5) + + # Example from paper + self.assertEqual(self.cmp.dist_abs('01000', '001111'), 5) + + # Coverage + self.assertEqual(self.cmp_q1.dist_abs('01000', '001111'), 5) + self.assertEqual(self.cmp_ws.dist_abs('a a b b c', 'a b b b c d'), 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_quantitative_cosine.py b/tests/distance/test_distance_quantitative_cosine.py new file mode 100644 index 000000000..582d82281 --- /dev/null +++ b/tests/distance/test_distance_quantitative_cosine.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_quantitative_cosine. + +This module contains unit tests for abydos.distance.QuantitativeCosine +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import QuantitativeCosine + + +class QuantitativeCosineTestCases(unittest.TestCase): + """Test QuantitativeCosine functions. + + abydos.distance.QuantitativeCosine + """ + + cmp = QuantitativeCosine() + + def test_quantitative_cosine_sim(self): + """Test abydos.distance.QuantitativeCosine.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6674238125 + ) + + def test_quantitative_cosine_dist(self): + """Test abydos.distance.QuantitativeCosine.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3325761875 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_quantitative_dice.py b/tests/distance/test_distance_quantitative_dice.py new file mode 100644 index 000000000..33e725138 --- /dev/null +++ b/tests/distance/test_distance_quantitative_dice.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_quantitative_dice. 
+ +This module contains unit tests for abydos.distance.QuantitativeDice +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import QuantitativeDice + + +class QuantitativeDiceTestCases(unittest.TestCase): + """Test QuantitativeDice functions. + + abydos.distance.QuantitativeDice + """ + + cmp = QuantitativeDice() + + def test_quantitative_dice_sim(self): + """Test abydos.distance.QuantitativeDice.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_quantitative_dice_dist(self): + """Test abydos.distance.QuantitativeDice.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_quantitative_jaccard.py b/tests/distance/test_distance_quantitative_jaccard.py new file mode 100644 index 000000000..1d86fae1d --- /dev/null +++ b/tests/distance/test_distance_quantitative_jaccard.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_quantitative_jaccard. + +This module contains unit tests for abydos.distance.QuantitativeJaccard +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import QuantitativeJaccard + + +class QuantitativeJaccardTestCases(unittest.TestCase): + """Test QuantitativeJaccard functions. 
+ + abydos.distance.QuantitativeJaccard + """ + + cmp = QuantitativeJaccard() + + def test_quantitative_jaccard_sim(self): + """Test abydos.distance.QuantitativeJaccard.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_quantitative_jaccard_dist(self): + """Test abydos.distance.QuantitativeJaccard.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rees_levenshtein.py b/tests/distance/test_distance_rees_levenshtein.py new file mode 100644 index 000000000..fb8c97227 --- /dev/null +++ b/tests/distance/test_distance_rees_levenshtein.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rees_levenshtein. + +This module contains unit tests for abydos.distance.ReesLevenshtein +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ReesLevenshtein + + +class ReesLevenshteinTestCases(unittest.TestCase): + """Test ReesLevenshtein functions. 
+ + abydos.distance.ReesLevenshtein + """ + + cmp = ReesLevenshtein() + + def test_rees_levenshtein_dist(self): + """Test abydos.distance.ReesLevenshtein.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_rees_levenshtein_sim(self): + """Test abydos.distance.ReesLevenshtein.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + def test_rees_levenshtein_dist_abs(self): + """Test abydos.distance.ReesLevenshtein.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 5) + + # Example from paper + self.assertEqual(self.cmp.dist_abs('Panulirus', 'Palinurus'), 2) + + # Coverage + self.assertEqual(self.cmp.dist_abs('a', 'b'), 1) + self.assertEqual(self.cmp.dist_abs('aaa', 'aaab'), 1) + self.assertEqual(self.cmp.dist_abs('aaa', 'baa'), 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_roberts.py b/tests/distance/test_distance_roberts.py new file mode 100644 index 000000000..73d9bfe9f --- /dev/null +++ b/tests/distance/test_distance_roberts.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_roberts. + +This module contains unit tests for abydos.distance.Roberts +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Roberts + + +class RobertsTestCases(unittest.TestCase): + """Test Roberts functions. + + abydos.distance.Roberts + """ + + cmp = Roberts() + + def test_roberts_sim(self): + """Test abydos.distance.Roberts.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_roberts_dist(self): + """Test abydos.distance.Roberts.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rogers_tanimoto.py b/tests/distance/test_distance_rogers_tanimoto.py new file mode 100644 index 000000000..91fcead6f --- /dev/null +++ b/tests/distance/test_distance_rogers_tanimoto.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rogers_tanimoto. + +This module contains unit tests for abydos.distance.RogersTanimoto +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RogersTanimoto + + +class RogersTanimotoTestCases(unittest.TestCase): + """Test RogersTanimoto functions. 
+ + abydos.distance.RogersTanimoto + """ + + cmp = RogersTanimoto() + cmp_no_d = RogersTanimoto(alphabet=0) + + def test_rogers_tanimoto_sim(self): + """Test abydos.distance.RogersTanimoto.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9949109414758269) + self.assertEqual(self.cmp.sim('', 'a'), 0.9949109414758269) + self.assertEqual(self.cmp.sim('abc', ''), 0.9898477157360406) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9748110831234257) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9848101266) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9848101266) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9848101266) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9848101266) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.982300885 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.2) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.2) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_rogers_tanimoto_dist(self): + """Test abydos.distance.RogersTanimoto.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0050890585241730735) + self.assertEqual(self.cmp.dist('', 'a'), 0.0050890585241730735) + self.assertEqual(self.cmp.dist('abc', ''), 0.010152284263959421) + self.assertEqual(self.cmp.dist('', 'abc'), 0.010152284263959421) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.02518891687657432) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0151898734) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0151898734) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0151898734) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0151898734) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.017699115 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.8) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.8) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + +if __name__ == '__main__': + unittest.main() diff 
--git a/tests/distance/test_distance_rogot_goldberg.py b/tests/distance/test_distance_rogot_goldberg.py new file mode 100644 index 000000000..568535568 --- /dev/null +++ b/tests/distance/test_distance_rogot_goldberg.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rogot_goldberg. + +This module contains unit tests for abydos.distance.RogotGoldberg +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RogotGoldberg + + +class RogotGoldbergTestCases(unittest.TestCase): + """Test RogotGoldberg functions. + + abydos.distance.RogotGoldberg + """ + + cmp = RogotGoldberg() + cmp_no_d = RogotGoldberg(alphabet=0) + + def test_rogot_goldberg_sim(self): + """Test abydos.distance.RogotGoldberg.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.49936143039591313) + self.assertEqual(self.cmp.sim('', 'a'), 0.49936143039591313) + self.assertEqual(self.cmp.sim('abc', ''), 0.49872122762148335) + self.assertEqual(self.cmp.sim('', 'abc'), 0.49872122762148335) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310708899 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_rogot_goldberg_dist(self): + """Test abydos.distance.RogotGoldberg.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5006385696040869) + self.assertEqual(self.cmp.dist('', 'a'), 0.5006385696040869) + self.assertEqual(self.cmp.dist('abc', ''), 0.5012787723785166) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5012787723785166) + self.assertEqual(self.cmp.dist('abc', 
'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1689291101 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rouge_l.py b/tests/distance/test_distance_rouge_l.py new file mode 100644 index 000000000..252fadbbb --- /dev/null +++ b/tests/distance/test_distance_rouge_l.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rouge_l. + +This module contains unit tests for abydos.distance.RougeL +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RougeL + + +class RougeLTestCases(unittest.TestCase): + """Test RougeL functions. 
+ + abydos.distance.RougeL + """ + + cmp = RougeL() + + def test_rouge_l_sim(self): + """Test abydos.distance.RougeL.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6009244992 + ) + + # Examples from paper + self.assertEqual(self.cmp.sim('pktg', 'pitg', beta=1), 0.75) + self.assertEqual(self.cmp.sim('pktg', 'tgip', beta=1), 0.5) + + def test_rouge_l_dist(self): + """Test abydos.distance.RougeL.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3990755008 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rouge_s.py b/tests/distance/test_distance_rouge_s.py new file mode 100644 index 000000000..a97b2a124 --- /dev/null +++ b/tests/distance/test_distance_rouge_s.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rouge_s. + +This module contains unit tests for abydos.distance.RougeS +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RougeS + + +class RougeSTestCases(unittest.TestCase): + """Test RougeS functions. 
+ + abydos.distance.RougeS + """ + + cmp = RougeS() + + def test_rouge_s_sim(self): + """Test abydos.distance.RougeS.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7578875171 + ) + + # Examples from paper + self.assertEqual(round(self.cmp.sim('pktg', 'pitg', beta=1), 3), 0.5) + self.assertEqual(round(self.cmp.sim('pktg', 'tgip', beta=1), 3), 0.167) + self.assertEqual(round(self.cmp.sim('pktg', 'tgpk', beta=1), 3), 0.333) + + def test_rouge_s_dist(self): + """Test abydos.distance.RougeS.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2421124829 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rouge_su.py b/tests/distance/test_distance_rouge_su.py new file mode 100644 index 000000000..0c9456349 --- /dev/null +++ b/tests/distance/test_distance_rouge_su.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rouge_su. + +This module contains unit tests for abydos.distance.RougeSU +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RougeSU + + +class RougeSUTestCases(unittest.TestCase): + """Test RougeSU functions. 
+ + abydos.distance.RougeSU + """ + + cmp = RougeSU() + + def test_rouge_su_sim(self): + """Test abydos.distance.RougeSU.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9333333333) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7840112202 + ) + + def test_rouge_su_dist(self): + """Test abydos.distance.RougeSU.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0666666667) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0666666667) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2159887798 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_rouge_w.py b/tests/distance/test_distance_rouge_w.py new file mode 100644 index 000000000..c8ff4e1bc --- /dev/null +++ b/tests/distance/test_distance_rouge_w.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_rouge_w. + +This module contains unit tests for abydos.distance.RougeW +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RougeW + + +class RougeWTestCases(unittest.TestCase): + """Test RougeW functions. 
+ + abydos.distance.RougeW + """ + + cmp = RougeW() + cmp_cubed = RougeW(f_func=lambda x: x ** 3, f_inv=lambda x: x ** (1 / 3)) + + def test_rouge_w_sim(self): + """Test abydos.distance.RougeW.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4472135955) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4472135955) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4898979486) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4898979486) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.548566506 + ) + + # Examples from paper + self.assertEqual(round(self.cmp.sim('ABCDEFG', 'ABCDHIK'), 3), 0.571) + self.assertEqual(round(self.cmp.sim('ABCDEFG', 'AHBKCID'), 3), 0.286) + + # Coverage + self.assertAlmostEqual( + self.cmp_cubed.sim('Nigel', 'Niall'), 0.4160167646 + ) + self.assertAlmostEqual( + self.cmp_cubed.sim('Colin', 'Coiln'), 0.4308869380 + ) + self.assertAlmostEqual( + self.cmp_cubed.sim('ATCAACGAGT', 'AACGATTAG'), 0.5125114739 + ) + + def test_rouge_w_dist(self): + """Test abydos.distance.RougeW.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5527864045) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5527864045) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5101020514) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5101020514) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.451433494 + ) + + def test_rouge_w_wlcs(self): + """Test abydos.distance.RougeW.wlcs.""" + self.assertEqual(self.cmp.wlcs('', ''), 0) + self.assertEqual(self.cmp.wlcs('a', ''), 0) + self.assertEqual(self.cmp.wlcs('', 'a'), 0) + self.assertEqual(self.cmp.wlcs('abc', ''), 0) + self.assertEqual(self.cmp.wlcs('', 'abc'), 0) + self.assertEqual(self.cmp.wlcs('abc', 'abc'), 9) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_russell_rao.py b/tests/distance/test_distance_russell_rao.py new file mode 100644 index 000000000..49945c357 --- /dev/null +++ b/tests/distance/test_distance_russell_rao.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . 
+ +"""abydos.tests.distance.test_distance_russell_rao. + +This module contains unit tests for abydos.distance.RussellRao +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import RussellRao + + +class RussellRaoTestCases(unittest.TestCase): + """Test RussellRao functions. + + abydos.distance.RussellRao + """ + + cmp = RussellRao() + cmp_no_d = RussellRao(alphabet=0) + + def test_russell_rao_sim(self): + """Test abydos.distance.RussellRao.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0038265306) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_russell_rao_dist(self): + """Test abydos.distance.RussellRao.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9961734694) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.9910714286 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 
0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_saps.py b/tests/distance/test_distance_saps.py new file mode 100644 index 000000000..2931ba05a --- /dev/null +++ b/tests/distance/test_distance_saps.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_saps. + +This module contains unit tests for abydos.distance.SAPS +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SAPS +from abydos.tokenizer import QGrams + + +class SAPSTestCases(unittest.TestCase): + """Test SAPS functions. + + abydos.distance.SAPS + """ + + cmp = SAPS() + cmp_q2 = SAPS(tokenizer=QGrams(2)) + + def test_saps_sim(self): + """Test abydos.distance.SAPS.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0666666667) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4333333333 + ) + + # Coverage + self.assertAlmostEqual( + self.cmp_q2.sim('Stevenson', 'Stinson'), 0.3857142857 + ) + + # Examples from paper + self.assertAlmostEqual( + self.cmp.sim('Stevenson', 'Stinson'), 0.551724138 + ) + + def test_saps_dist(self): + """Test abydos.distance.SAPS.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9333333333) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5666666667 + ) + + # Coverage + self.assertAlmostEqual( + 
self.cmp_q2.dist('Stevenson', 'Stinson'), 0.614285714 + ) + + # Examples from paper + self.assertAlmostEqual( + self.cmp.dist('Stevenson', 'Stinson'), 0.448275862 + ) + + def test_saps_sim_score(self): + """Test abydos.distance.SAPS.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0) + self.assertEqual(self.cmp.sim_score('a', ''), -3) + self.assertEqual(self.cmp.sim_score('', 'a'), -3) + self.assertEqual(self.cmp.sim_score('abc', ''), -7) + self.assertEqual(self.cmp.sim_score('', 'abc'), -7) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 13) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), -7) + + self.assertAlmostEqual(self.cmp.sim_score('Nigel', 'Niall'), 1) + self.assertAlmostEqual(self.cmp.sim_score('Niall', 'Nigel'), 1) + self.assertAlmostEqual(self.cmp.sim_score('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp.sim_score('Coiln', 'Colin'), 1) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 13 + ) + + # Coverage + self.assertEqual(self.cmp_q2.sim_score('Stevenson', 'Stinson'), 27) + + # Examples from paper + self.assertEqual(self.cmp.sim_score('Stevenson', 'Stinson'), 16) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_scott_pi.py b/tests/distance/test_distance_scott_pi.py new file mode 100644 index 000000000..ab15a93a4 --- /dev/null +++ b/tests/distance/test_distance_scott_pi.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_scott_pi. + +This module contains unit tests for abydos.distance.ScottPi +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ScottPi +from abydos.tokenizer import QSkipgrams + + +class ScottPiTestCases(unittest.TestCase): + """Test ScottPi functions. 
+ + abydos.distance.ScottPi + """ + + cmp = ScottPi() + cmp_no_d = ScottPi(alphabet=0) + + def test_scott_pi_sim(self): + """Test abydos.distance.ScottPi.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.4993614303959431) + self.assertEqual(self.cmp.sim('', 'a'), 0.4993614303959431) + self.assertEqual(self.cmp.sim('abc', ''), 0.49872122762147786) + self.assertEqual(self.cmp.sim('', 'abc'), 0.49872122762147786) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.49679075738125517) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8310708899 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_scott_pi_dist(self): + """Test abydos.distance.ScottPi.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.500638569604057) + self.assertEqual(self.cmp.dist('', 'a'), 0.500638569604057) + self.assertEqual(self.cmp.dist('abc', ''), 0.5012787723785221) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5012787723785221) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5032092426187449) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1689291101 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_scott_pi_corr(self): + """Test abydos.distance.ScottPi.corr.""" + # Base cases + 
self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -0.0012771392081137526) + self.assertEqual(self.cmp.corr('', 'a'), -0.0012771392081137526) + self.assertEqual(self.cmp.corr('abc', ''), -0.0025575447570442954) + self.assertEqual(self.cmp.corr('', 'abc'), -0.0025575447570442954) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237489689) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6621417798 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3333333333 + ) + + self.assertEqual( + ScottPi( + alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK') + ).corr('eh', 'a'), + 0.0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_shape.py b/tests/distance/test_distance_shape.py new file mode 100644 index 000000000..3e3e19051 --- /dev/null +++ b/tests/distance/test_distance_shape.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_shape. + +This module contains unit tests for abydos.distance.Shape +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Shape + + +class ShapeTestCases(unittest.TestCase): + """Test Shape functions. 
+ + abydos.distance.Shape + """ + + cmp = Shape() + cmp_no_d = Shape(alphabet=0) + + def test_shape_dist(self): + """Test abydos.distance.Shape.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0025445127030404) + self.assertEqual(self.cmp.dist('', 'a'), 0.0025445127030404) + self.assertEqual(self.cmp.dist('abc', ''), 0.005076009995835068) + self.assertEqual(self.cmp.dist('', 'abc'), 0.005076009995835068) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.01259240941274469) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0075944919) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0075944919) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0075944919) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0075944919) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.008848852 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.2222222222 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.25 + ) + + def test_shape_sim(self): + """Test abydos.distance.Shape.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974554872969597) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974554872969597) + self.assertEqual(self.cmp.sim('abc', ''), 0.994923990004165) + self.assertEqual(self.cmp.sim('', 'abc'), 0.994923990004165) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9874075905872554) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9924055081) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9924055081) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9924055081) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9924055081) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.991151148 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.7777777778 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.75 + ) + + +if __name__ == 
'__main__': + unittest.main() diff --git a/tests/distance/test_distance_shapira_storer_i.py b/tests/distance/test_distance_shapira_storer_i.py new file mode 100644 index 000000000..fb3aa3ddd --- /dev/null +++ b/tests/distance/test_distance_shapira_storer_i.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_shapira_storer_i. + +This module contains unit tests for abydos.distance.ShapiraStorerI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import ShapiraStorerI + + +class ShapiraStorerITestCases(unittest.TestCase): + """Test ShapiraStorerI functions. + + abydos.distance.ShapiraStorerI + """ + + cmp = ShapiraStorerI() + cmp_prime = ShapiraStorerI(prime=True) + + def test_shapira_storer_i_dist(self): + """Test abydos.distance.ShapiraStorerI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2105263158 + ) + + self.assertAlmostEqual( + self.cmp.dist('AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB'), + 0.3409090909090909, + ) + self.assertAlmostEqual( + self.cmp_prime.dist( + 'AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB' + ), + 0.5454545454545454, + ) + self.assertAlmostEqual( + self.cmp.dist('AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB'), + 0.3409090909090909, + ) + self.assertAlmostEqual( + self.cmp_prime.dist( + 'AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB' + ), + 0.5454545454545454, + ) + + def test_shapira_storer_i_sim(self): + """Test abydos.distance.ShapiraStorerI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 
'AACGATTAG'), 0.7894736842 + ) + + self.assertAlmostEqual( + self.cmp.sim('AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB'), + 0.6590909090909092, + ) + self.assertAlmostEqual( + self.cmp_prime.sim( + 'AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB' + ), + 0.4545454545454546, + ) + self.assertAlmostEqual( + self.cmp.sim('AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB'), + 0.6590909090909092, + ) + self.assertAlmostEqual( + self.cmp_prime.sim( + 'AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB' + ), + 0.4545454545454546, + ) + + def test_shapira_storer_i_dist_abs(self): + """Test abydos.distance.ShapiraStorerI.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 8) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 4) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4) + + self.assertAlmostEqual( + self.cmp.dist_abs( + 'AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB' + ), + 15, + ) + self.assertAlmostEqual( + self.cmp_prime.dist_abs( + 'AABAACADAB', 'AABAABAACADABADABAABAABAACADABADAB' + ), + 24, + ) + self.assertAlmostEqual( + self.cmp.dist_abs( + 'AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB' + ), + 15, + ) + self.assertAlmostEqual( + self.cmp_prime.dist_abs( + 'AABAABAACADABADABAABAABAACADABADAB', 'AABAACADAB' + ), + 24, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sift4.py b/tests/distance/test_distance_sift4.py index 1b1f38b12..8d3b115c9 100644 --- a/tests/distance/test_distance_sift4.py +++ b/tests/distance/test_distance_sift4.py @@ -40,6 +40,7 @@ class Sift4TestCases(unittest.TestCase): """ cmp = Sift4() + cmp55 = Sift4(5, 5) def test_sift4_dist_abs(self): """Test abydos.distance.Sift4.dist_abs.""" @@ -79,29 +80,25 @@ def test_sift4_dist_abs(self): # Tests copied from # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java self.assertEqual( - self.cmp.dist_abs( - 'This is the first string', 'And this is another string', 5 + Sift4(5).dist_abs( + 'This is the first string', 'And this is another string' ), 11, ) self.assertEqual( - self.cmp.dist_abs( - 'Lorem ipsum dolor sit amet, ' - + 'consectetur adipiscing elit.', - 'Amet Lorm ispum dolor sit amet, ' - + 'consetetur adixxxpiscing elit.', - 10, + Sift4(10).dist_abs( + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing' + + ' elit.', ), 12, ) # cases with max_distance - self.assertEqual(self.cmp.dist_abs('example', 'samples', 5, 5), 5) - self.assertEqual(self.cmp.dist_abs('sturgeon', 'urgently', 5, 5), 5) - self.assertEqual( - self.cmp.dist_abs('levenshtein', 'frankenstein', 5, 5), 5 - ) - self.assertEqual(self.cmp.dist_abs('distance', 'difference', 5, 5), 5) + self.assertEqual(self.cmp55.dist_abs('example', 'samples'), 5) + self.assertEqual(self.cmp55.dist_abs('sturgeon', 'urgently'), 5) + self.assertEqual(self.cmp55.dist_abs('levenshtein', 'frankenstein'), 5) + 
self.assertEqual(self.cmp55.dist_abs('distance', 'difference'), 5) # Test wrapper self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7) @@ -152,35 +149,29 @@ def test_sift4_dist(self): # Tests copied from # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java self.assertAlmostEqual( - self.cmp.dist( - 'This is the first string', 'And this is another string', 5 + Sift4(5).dist( + 'This is the first string', 'And this is another string' ), 0.423076923, ) self.assertAlmostEqual( - self.cmp.dist( - 'Lorem ipsum dolor sit amet, ' - + 'consectetur adipiscing elit.', - 'Amet Lorm ispum dolor sit amet, ' - + 'consetetur adixxxpiscing elit.', - 10, + Sift4(10).dist( + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing' + + ' elit.', ), 0.193548387, ) # cases with max_distance self.assertAlmostEqual( - self.cmp.dist('example', 'samples', 5, 5), 0.714285714 - ) - self.assertAlmostEqual( - self.cmp.dist('sturgeon', 'urgently', 5, 5), 0.625 - ) - self.assertAlmostEqual( - self.cmp.dist('levenshtein', 'frankenstein', 5, 5), 0.416666666 + self.cmp55.dist('example', 'samples'), 0.714285714 ) + self.assertAlmostEqual(self.cmp55.dist('sturgeon', 'urgently'), 0.625) self.assertAlmostEqual( - self.cmp.dist('distance', 'difference', 5, 5), 0.5 + self.cmp55.dist('levenshtein', 'frankenstein'), 0.416666666 ) + self.assertAlmostEqual(self.cmp55.dist('distance', 'difference'), 0.5) # Test wrapper self.assertAlmostEqual( @@ -231,35 +222,29 @@ def test_sift4_sim(self): # Tests copied from # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java self.assertAlmostEqual( - self.cmp.sim( - 'This is the first string', 'And this is another string', 5 + Sift4(5).sim( + 'This is the first string', 'And this is another string' ), 0.576923077, ) self.assertAlmostEqual( - self.cmp.sim( - 'Lorem ipsum dolor sit amet, ' - + 'consectetur adipiscing elit.', - 'Amet Lorm ispum dolor sit amet, ' - + 'consetetur adixxxpiscing elit.', - 10, + Sift4(10).sim( + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing' + + ' elit.', ), 0.806451613, ) # cases with max_distance self.assertAlmostEqual( - self.cmp.sim('example', 'samples', 5, 5), 0.285714286 - ) - self.assertAlmostEqual( - self.cmp.sim('sturgeon', 'urgently', 5, 5), 0.375 - ) - self.assertAlmostEqual( - self.cmp.sim('levenshtein', 'frankenstein', 5, 5), 0.583333333 + self.cmp55.sim('example', 'samples'), 0.285714286 ) + self.assertAlmostEqual(self.cmp55.sim('sturgeon', 'urgently'), 0.375) self.assertAlmostEqual( - self.cmp.sim('distance', 'difference', 5, 5), 0.5 + self.cmp55.sim('levenshtein', 'frankenstein'), 0.583333333 ) + self.assertAlmostEqual(self.cmp55.sim('distance', 'difference'), 0.5) # Test wrapper self.assertAlmostEqual( diff --git a/tests/distance/test_distance_sift4_extended.py b/tests/distance/test_distance_sift4_extended.py new file mode 100644 index 000000000..b9615b49f --- /dev/null +++ b/tests/distance/test_distance_sift4_extended.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sift4_extended. + +This module contains unit tests for abydos.distance.Sift4Extended +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Sift4Extended +from abydos.tokenizer import QGrams + + +class Sift4ExtendedTestCases(unittest.TestCase): + """Test Sift4Extended functions. + + abydos.distance.Sift4Extended + """ + + ltamc = Sift4Extended.longer_transpositions_are_more_costly + + cmp = Sift4Extended() + cmp_kwargs = Sift4Extended( + tokenizer=QGrams(qval=2), + token_matcher=Sift4Extended.sift4_token_matcher, + matching_evaluator=Sift4Extended.sift4_matching_evaluator, + local_length_evaluator=Sift4Extended.reward_length_evaluator, + transposition_cost_evaluator=ltamc, + transpositions_evaluator=lambda lcss, trans: lcss - trans, + ) + cmp_kwargs2 = Sift4Extended( + local_length_evaluator=Sift4Extended.reward_length_evaluator_exp + ) + cmp_md = Sift4Extended(max_distance=3) + + def test_sift4_extended_dist_abs(self): + """Test abydos.distance.Sift4Extended.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 3) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 1) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4) + + self.assertEqual(self.cmp_kwargs.dist_abs('', ''), 0) + self.assertEqual(self.cmp_kwargs.dist_abs('a', ''), 2) + self.assertEqual(self.cmp_kwargs.dist_abs('', 'a'), 2) + self.assertEqual(self.cmp_kwargs.dist_abs('abc', ''), 4) + self.assertEqual(self.cmp_kwargs.dist_abs('', 'abc'), 4) + self.assertEqual(self.cmp_kwargs.dist_abs('abc', 'abc'), -1) + self.assertEqual(self.cmp_kwargs.dist_abs('abcd', 'efgh'), -2) + + self.assertAlmostEqual(self.cmp_kwargs.dist_abs('Nigel', 'Niall'), 1) + self.assertAlmostEqual(self.cmp_kwargs.dist_abs('Niall', 'Nigel'), 1) + self.assertAlmostEqual(self.cmp_kwargs.dist_abs('Colin', 'Coiln'), 1) + self.assertAlmostEqual(self.cmp_kwargs.dist_abs('Coiln', 'Colin'), 1) + self.assertAlmostEqual( + self.cmp_kwargs.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2 + ) + + self.assertEqual(self.cmp_kwargs2.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp_kwargs2.dist_abs('abcd', 'efgh'), 8) + + self.assertAlmostEqual(self.cmp_kwargs2.dist_abs('Nigel', 'Niall'), 7) + self.assertAlmostEqual(self.cmp_kwargs2.dist_abs('Niall', 'Nigel'), 7) + 
self.assertAlmostEqual(self.cmp_kwargs2.dist_abs('Colin', 'Coiln'), 6) + self.assertAlmostEqual(self.cmp_kwargs2.dist_abs('Coiln', 'Colin'), 6) + self.assertAlmostEqual( + self.cmp_kwargs2.dist_abs('ATCAACGAGT', 'AACGATTAG'), 25 + ) + + # coverage completion + self.assertAlmostEqual( + self.cmp_kwargs.dist_abs('beaurocracy', 'bureaucracy'), 3 + ) + self.assertAlmostEqual( + self.cmp_md.dist_abs('beaurocratically', 'bureaucracy'), 3 + ) + self.assertAlmostEqual( + self.cmp_md.dist_abs('bureaucracy', 'bureaucracy'), 3 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sift4_simplest.py b/tests/distance/test_distance_sift4_simplest.py index a2f4192ba..68e54aaca 100644 --- a/tests/distance/test_distance_sift4_simplest.py +++ b/tests/distance/test_distance_sift4_simplest.py @@ -79,18 +79,16 @@ def test_sift4_simplest_dist_abs(self): # Tests copied from # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java self.assertEqual( - self.cmp.dist_abs( - 'This is the first string', 'And this is another string', 5 + Sift4Simplest(5).dist_abs( + 'This is the first string', 'And this is another string' ), 13, ) self.assertEqual( - self.cmp.dist_abs( - 'Lorem ipsum dolor sit amet, ' - + 'consectetur adipiscing elit.', - 'Amet Lorm ispum dolor sit amet, ' - + 'consetetur adixxxpiscing elit.', - 10, + Sift4Simplest(10).dist_abs( + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing' + + ' elit.', ), 20, ) diff --git a/tests/distance/test_distance_single_linkage.py b/tests/distance/test_distance_single_linkage.py new file mode 100644 index 000000000..e2a0a0c36 --- /dev/null +++ b/tests/distance/test_distance_single_linkage.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_single_linkage. + +This module contains unit tests for abydos.distance.SingleLinkage +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import JaroWinkler, SingleLinkage + + +class SingleLinkageTestCases(unittest.TestCase): + """Test SingleLinkage functions. 
+ + abydos.distance.SingleLinkage + """ + + cmp = SingleLinkage() + cmp_jw = SingleLinkage(metric=JaroWinkler()) + + def test_single_linkage_dist(self): + """Test abydos.distance.SingleLinkage.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0) + + def test_single_linkage_sim(self): + """Test abydos.distance.SingleLinkage.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 1.0) + + def test_single_linkage_dist_abs(self): + """Test abydos.distance.SingleLinkage.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), float('inf')) + self.assertEqual(self.cmp.dist_abs('a', ''), float('inf')) + self.assertEqual(self.cmp.dist_abs('', 'a'), float('inf')) + self.assertEqual(self.cmp.dist_abs('abc', ''), float('inf')) + self.assertEqual(self.cmp.dist_abs('', 'abc'), float('inf')) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 1) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 0) + self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0) + + self.assertAlmostEqual(self.cmp_jw.dist_abs('abcd', 'dj'), 1 / 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_size.py b/tests/distance/test_distance_size.py new file mode 100644 index 000000000..c4a2e8b1e --- /dev/null +++ b/tests/distance/test_distance_size.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. 
If not, see . + +"""abydos.tests.distance.test_distance_size. + +This module contains unit tests for abydos.distance.Size +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Size + + +class SizeTestCases(unittest.TestCase): + """Test Size functions. + + abydos.distance.Size + """ + + cmp = Size() + cmp_no_d = Size(alphabet=0) + + def test_size_dist(self): + """Test abydos.distance.Size.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 6.507705122865473e-06) + self.assertEqual(self.cmp.dist('', 'a'), 6.507705122865473e-06) + self.assertEqual(self.cmp.dist('abc', ''), 2.6030820491461892e-05) + self.assertEqual(self.cmp.dist('', 'abc'), 2.6030820491461892e-05) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.0001626926280716368) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 5.85693e-05) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 5.85693e-05) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 7.97194e-05 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.4444444444 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.25 + ) + + def test_size_sim(self): + """Test abydos.distance.Size.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9999934922948771) + self.assertEqual(self.cmp.sim('', 'a'), 0.9999934922948771) + self.assertEqual(self.cmp.sim('abc', ''), 0.9999739691795085) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9999739691795085) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9998373073719283) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9999414307) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9999414307) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9999202806 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.5555555556 + 
) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.5555555556 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.75 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_smith_waterman.py b/tests/distance/test_distance_smith_waterman.py index c63876f07..9eb7806b5 100644 --- a/tests/distance/test_distance_smith_waterman.py +++ b/tests/distance/test_distance_smith_waterman.py @@ -44,29 +44,25 @@ class SmithWatermanTestCases(unittest.TestCase): abydos.distance.SmithWaterman """ - cmp = SmithWaterman() - def test_smith_waterman_dist_abs(self): """Test abydos.distance.SmithWaterman.dist_abs.""" - self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(SmithWaterman().dist_abs('', ''), 0) # https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm self.assertEqual( - self.cmp.dist_abs('GATTACA', 'GCATGCU', 1, _sim_nw), 0 + SmithWaterman(1, _sim_nw).dist_abs('GATTACA', 'GCATGCU'), 0 ) self.assertEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, _sim_wikipedia), 26 + SmithWaterman(5, _sim_wikipedia).dist_abs( + 'AGACTAGTTAC', 'CGAGACGT' + ), + 26, ) - self.assertEqual( - self.cmp.dist_abs('CGATATCAG', 'TGACGSTGC', 5, _sim_nw), 0 - ) - self.assertEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'TGACGSTGC', 5, _sim_nw), 1 - ) - self.assertEqual( - self.cmp.dist_abs('AGACTAGTTAC', 'CGAGACGT', 5, _sim_nw), 0 - ) + sw5 = SmithWaterman(5, _sim_nw) + self.assertEqual(sw5.dist_abs('CGATATCAG', 'TGACGSTGC'), 0) + self.assertEqual(sw5.dist_abs('AGACTAGTTAC', 'TGACGSTGC'), 1) + self.assertEqual(sw5.dist_abs('AGACTAGTTAC', 'CGAGACGT'), 0) # Test wrapper self.assertEqual( @@ -76,10 +72,9 @@ def test_smith_waterman_dist_abs(self): def test_smith_waterman_dist_abs_nialls(self): """Test abydos.distance.SmithWaterman.dist_abs (Nialls set).""" sw_vals = (5, 1, 1, 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 1, 0, 0) + sw2 = SmithWaterman(2, _sim_nw) for i in range(len(NIALL)): - self.assertEqual( - self.cmp.dist_abs(NIALL[0], NIALL[i], 2, _sim_nw), sw_vals[i] - ) + self.assertEqual(sw2.dist_abs(NIALL[0], NIALL[i]), sw_vals[i]) if __name__ == '__main__': diff --git a/tests/distance/test_distance_soft_cosine.py b/tests/distance/test_distance_soft_cosine.py new file mode 100644 index 000000000..25d2de1e4 --- /dev/null +++ b/tests/distance/test_distance_soft_cosine.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_soft_cosine. 
+ +This module contains unit tests for abydos.distance.SoftCosine +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SoftCosine + + +class SoftCosineTestCases(unittest.TestCase): + """Test SoftCosine functions. + + abydos.distance.SoftCosine + """ + + cmp = SoftCosine() + cmp_b = SoftCosine(sim_method='b') + cmp_c = SoftCosine(sim_method='c') + cmp_d = SoftCosine(sim_method='d') + + def test_soft_cosine_sim(self): + """Test abydos.distance.SoftCosine.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.7428571428571427) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.898146239) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.898146239) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9375) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9375) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9731507012 + ) + + # Constructor exception + with self.assertRaises(ValueError): + SoftCosine(sim_method='e') + + # Alternate sim_methods + # Base cases + self.assertEqual(self.cmp_b.sim('', ''), 1.0) + self.assertEqual(self.cmp_b.sim('a', ''), 0.0) + self.assertEqual(self.cmp_b.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_b.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_b.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_b.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp_b.sim('abcd', 'efgh'), 0.2) + + self.assertAlmostEqual(self.cmp_b.sim('Nigel', 'Niall'), 0.721687836) + self.assertAlmostEqual(self.cmp_b.sim('Niall', 'Nigel'), 0.721687836) + self.assertAlmostEqual(self.cmp_b.sim('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_b.sim('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_b.sim('ATCAACGAGT', 'AACGATTAG'), 0.98328200498 + ) + + # Base cases + self.assertEqual(self.cmp_c.sim('', ''), 1.0) + self.assertEqual(self.cmp_c.sim('a', ''), 0.0) + self.assertEqual(self.cmp_c.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_c.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_c.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_c.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp_c.sim('abcd', 'efgh'), 0.2828427124746) + + self.assertAlmostEqual(self.cmp_c.sim('Nigel', 'Niall'), 0.800818463) + self.assertAlmostEqual(self.cmp_c.sim('Niall', 'Nigel'), 0.800818463) + self.assertAlmostEqual(self.cmp_c.sim('Colin', 'Coiln'), 1.207106781) + self.assertAlmostEqual(self.cmp_c.sim('Coiln', 'Colin'), 1.207106781) + self.assertAlmostEqual( + self.cmp_c.sim('ATCAACGAGT', 'AACGATTAG'), 1.023072064 + ) + + # Base cases + self.assertEqual(self.cmp_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_d.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual(self.cmp_d.sim('abcd', 'efgh'), 0.1) + + self.assertAlmostEqual(self.cmp_d.sim('Nigel', 'Niall'), 0.6172133998) + self.assertAlmostEqual(self.cmp_d.sim('Niall', 'Nigel'), 0.6172133998) + self.assertAlmostEqual(self.cmp_d.sim('Colin', 'Coiln'), 0.75) + 
self.assertAlmostEqual(self.cmp_d.sim('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.89597867038 + ) + + def test_soft_cosine_dist(self): + """Test abydos.distance.SoftCosine.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('abcd', 'efgh'), 0.25714285714285734 + ) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.101853761) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.101853761) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0625) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0625) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0268492988 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_softtf_idf.py b/tests/distance/test_distance_softtf_idf.py new file mode 100644 index 000000000..ab58a27f0 --- /dev/null +++ b/tests/distance/test_distance_softtf_idf.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_softtf_idf. + +This module contains unit tests for abydos.distance.SoftTFIDF +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import os +import unittest + +from abydos.corpus import UnigramCorpus +from abydos.distance import Levenshtein, SoftTFIDF +from abydos.tokenizer import QGrams +from abydos.util import download_package, package_path + +from six import PY2 + + +class SoftTFIDFTestCases(unittest.TestCase): + """Test SoftTFIDF functions. 
+ + abydos.distance.SoftTFIDF + """ + + cmp = SoftTFIDF() + cmp_lev = SoftTFIDF(metric=Levenshtein()) + + def test_softtf_idf_sim(self): + """Test abydos.distance.SoftTFIDF.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.304044497) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4676712137 + ) + + self.assertAlmostEqual(self.cmp_lev.sim('Nigel', 'Niall'), 0.304044497) + self.assertAlmostEqual(self.cmp_lev.sim('Niall', 'Nigel'), 0.304044497) + self.assertAlmostEqual(self.cmp_lev.sim('Colin', 'Coiln'), 0.304044497) + self.assertAlmostEqual(self.cmp_lev.sim('Coiln', 'Colin'), 0.304044497) + self.assertAlmostEqual( + self.cmp_lev.sim('ATCAACGAGT', 'AACGATTAG'), 0.4676712137 + ) + + def test_softtf_idf_dist(self): + """Test abydos.distance.SoftTFIDF.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.695955503) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5323287863 + ) + + def test_softtf_idf_corpus(self): + """Test abydos.distance.SoftTFIDF.sim & .dist with corpus.""" + if PY2: # disable testing in Py2.7; the pickled data isn't supported + return + + download_package('en_qgram', silent=True) + + q3_corpus = UnigramCorpus(word_tokenizer=QGrams(qval=3)) + q3_corpus.load_corpus( + os.path.join(package_path('en_qgram'), 'q3_en.dat') + ) + cmp_q3_08 = SoftTFIDF( + tokenizer=QGrams(qval=3), corpus=q3_corpus, threshold=0.8 + ) + cmp_q3_03 = SoftTFIDF( + tokenizer=QGrams(qval=3), corpus=q3_corpus, threshold=0.3 + ) + + self.assertAlmostEqual(cmp_q3_08.sim('Nigel', 'Niall'), 0.608842672) + self.assertAlmostEqual(cmp_q3_08.sim('Niall', 'Nigel'), 0.608842672) + self.assertAlmostEqual(cmp_q3_08.sim('Colin', 'Coiln'), 0.383052250) + self.assertAlmostEqual(cmp_q3_08.sim('Coiln', 'Colin'), 0.383052250) + + # These values won't be stable, so we just use Greater/Less + self.assertGreater(cmp_q3_03.sim('Nigel', 'Niall'), 0.5) + self.assertGreater(cmp_q3_03.sim('Niall', 'Nigel'), 0.5) + self.assertGreater(cmp_q3_03.sim('Colin', 'Coiln'), 0.5) + self.assertGreater(cmp_q3_03.sim('Coiln', 'Colin'), 0.5) + + self.assertAlmostEqual(cmp_q3_08.dist('Nigel', 'Niall'), 0.391157328) + self.assertAlmostEqual(cmp_q3_08.dist('Niall', 'Nigel'), 0.391157328) + self.assertAlmostEqual(cmp_q3_08.dist('Colin', 'Coiln'), 0.616947750) + self.assertAlmostEqual(cmp_q3_08.dist('Coiln', 'Colin'), 0.616947750) + + # These 
values won't be stable, so we just use Greater/Less + self.assertLess(cmp_q3_03.dist('Nigel', 'Niall'), 0.5) + self.assertLess(cmp_q3_03.dist('Niall', 'Nigel'), 0.5) + self.assertLess(cmp_q3_03.dist('Colin', 'Coiln'), 0.5) + self.assertLess(cmp_q3_03.dist('Coiln', 'Colin'), 0.5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sokal_michener.py b/tests/distance/test_distance_sokal_michener.py new file mode 100644 index 000000000..3214dda1d --- /dev/null +++ b/tests/distance/test_distance_sokal_michener.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_michener. + +This module contains unit tests for abydos.distance.SokalMichener +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalMichener + + +class SokalMichenerTestCases(unittest.TestCase): + """Test SokalMichener functions. + + abydos.distance.SokalMichener + """ + + cmp = SokalMichener() + cmp_no_d = SokalMichener(alphabet=0) + + def test_sokal_michener_sim(self): + """Test abydos.distance.SokalMichener.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9974489795918368) + self.assertEqual(self.cmp.sim('', 'a'), 0.9974489795918368) + self.assertEqual(self.cmp.sim('abc', ''), 0.9948979591836735) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9910714286 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_sokal_michener_dist(self): + """Test 
abydos.distance.SokalMichener.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('', 'a'), 0.0025510204081632404) + self.assertEqual(self.cmp.dist('abc', ''), 0.005102040816326481) + self.assertEqual(self.cmp.dist('', 'abc'), 0.005102040816326481) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816313) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sokal_sneath_i.py b/tests/distance/test_distance_sokal_sneath_i.py new file mode 100644 index 000000000..91d404625 --- /dev/null +++ b/tests/distance/test_distance_sokal_sneath_i.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_sneath_i. + +This module contains unit tests for abydos.distance.SokalSneathI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalSneathI + + +class SokalSneathITestCases(unittest.TestCase): + """Test SokalSneathI functions. 
+ + abydos.distance.SokalSneathI + """ + + cmp = SokalSneathI() + cmp_no_d = SokalSneathI(alphabet=0) + + def test_sokal_sneath_i_sim(self): + """Test abydos.distance.SokalSneathI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918263) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918263) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429667) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429667) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961587708) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961587708) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9955156951 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_sokal_sneath_i_dist(self): + """Test abydos.distance.SokalSneathI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033292) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033292) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038412292) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038412292) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0044843049 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/distance/test_distance_sokal_sneath_ii.py b/tests/distance/test_distance_sokal_sneath_ii.py new file mode 100644 index 000000000..ea9602886 --- /dev/null +++ b/tests/distance/test_distance_sokal_sneath_ii.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_sneath_ii. + +This module contains unit tests for abydos.distance.SokalSneathII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalSneathII + + +class SokalSneathIITestCases(unittest.TestCase): + """Test SokalSneathII functions. + + abydos.distance.SokalSneathII + """ + + cmp = SokalSneathII() + + def test_sokal_sneath_ii_sim(self): + """Test abydos.distance.SokalSneathII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_sokal_sneath_ii_dist(self): + """Test abydos.distance.SokalSneathII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sokal_sneath_iii.py b/tests/distance/test_distance_sokal_sneath_iii.py new file mode 100644 index 000000000..9689b27da --- /dev/null +++ b/tests/distance/test_distance_sokal_sneath_iii.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_sneath_iii. + +This module contains unit tests for abydos.distance.SokalSneathIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalSneathIII + + +class SokalSneathIIITestCases(unittest.TestCase): + """Test SokalSneathIII functions. + + abydos.distance.SokalSneathIII + """ + + cmp = SokalSneathIII() + + def test_sokal_sneath_iii_sim_score(self): + """Test abydos.distance.SokalSneathIII.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), float('inf')) + self.assertEqual(self.cmp.sim_score('a', ''), 391.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 391.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 195.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 195.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), float('inf')) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 77.4) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 129.6666666667 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 129.6666666667 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 129.6666666667 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 129.6666666667 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 111.0 + ) + + def test_sokal_sneath_iii_dist(self): + """Test abydos.distance.SokalSneathIII.dist.""" + self.assertRaises(NotImplementedError, self.cmp.dist) + + def test_sokal_sneath_iii_sim(self): + """Test abydos.distance.SokalSneathIII.sim.""" + self.assertRaises(NotImplementedError, self.cmp.sim) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sokal_sneath_iv.py b/tests/distance/test_distance_sokal_sneath_iv.py new file mode 100644 index 000000000..fc86014c4 --- /dev/null +++ b/tests/distance/test_distance_sokal_sneath_iv.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_sneath_iv. 
+ +This module contains unit tests for abydos.distance.SokalSneathIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalSneathIV + + +class SokalSneathIVTestCases(unittest.TestCase): + """Test SokalSneathIV functions. + + abydos.distance.SokalSneathIV + """ + + cmp = SokalSneathIV() + cmp_no_d = SokalSneathIV(alphabet=0) + + def test_sokal_sneath_iv_sim(self): + """Test abydos.distance.SokalSneathIV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.4993622448979592) + self.assertEqual(self.cmp.sim('', 'a'), 0.4993622448979592) + self.assertEqual(self.cmp.sim('abc', ''), 0.4987244897959184) + self.assertEqual(self.cmp.sim('', 'abc'), 0.4987244897959184) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8318286736 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3340909091 + ) + + def test_sokal_sneath_iv_dist(self): + """Test abydos.distance.SokalSneathIV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5006377551020408) + self.assertEqual(self.cmp.dist('', 'a'), 0.5006377551020408) + self.assertEqual(self.cmp.dist('abc', ''), 0.5012755102040816) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5012755102040816) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1681713264 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + 
self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6659090909 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sokal_sneath_v.py b/tests/distance/test_distance_sokal_sneath_v.py new file mode 100644 index 000000000..87d06902b --- /dev/null +++ b/tests/distance/test_distance_sokal_sneath_v.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sokal_sneath_v. + +This module contains unit tests for abydos.distance.SokalSneathV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import SokalSneathV + + +class SokalSneathVTestCases(unittest.TestCase): + """Test SokalSneathV functions. + + abydos.distance.SokalSneathV + """ + + cmp = SokalSneathV() + cmp_no_d = SokalSneathV(alphabet=0) + + def test_sokal_sneath_v_sim(self): + """Test abydos.distance.SokalSneathV.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.4980719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.4980719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.4980719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.4980719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.664403934 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_sokal_sneath_v_dist(self): + """Test abydos.distance.SokalSneathV.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', 
''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5019280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5019280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5019280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5019280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.335596066 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_sorgenfrei.py b/tests/distance/test_distance_sorgenfrei.py new file mode 100644 index 000000000..5a1167e87 --- /dev/null +++ b/tests/distance/test_distance_sorgenfrei.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_sorgenfrei. + +This module contains unit tests for abydos.distance.Sorgenfrei +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Sorgenfrei + + +class SorgenfreiTestCases(unittest.TestCase): + """Test Sorgenfrei functions. 
+ + abydos.distance.Sorgenfrei + """ + + cmp = Sorgenfrei() + cmp_no_d = Sorgenfrei(alphabet=0) + + def test_sorgenfrei_sim(self): + """Test abydos.distance.Sorgenfrei.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4454545455 + ) + + def test_sorgenfrei_dist(self): + """Test abydos.distance.Sorgenfrei.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5545454545 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_steffensen.py b/tests/distance/test_distance_steffensen.py new file mode 100644 index 000000000..ee1bbc140 --- /dev/null +++ b/tests/distance/test_distance_steffensen.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_steffensen. + +This module contains unit tests for abydos.distance.Steffensen +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Steffensen + + +class SteffensenTestCases(unittest.TestCase): + """Test Steffensen functions. 
+ + abydos.distance.Steffensen + """ + + cmp = Steffensen() + cmp_no_d = Steffensen(alphabet=0) + + def test_steffensen_sim(self): + """Test abydos.distance.Steffensen.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 4.1196952743871653e-05) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2461588279) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.439469213 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1090909091 + ) + + def test_steffensen_dist(self): + """Test abydos.distance.Steffensen.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9999588030472562) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7538411721) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.560530787 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8909090909 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_stiles.py b/tests/distance/test_distance_stiles.py new file mode 100644 index 000000000..9fd394705 --- /dev/null 
+++ b/tests/distance/test_distance_stiles.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_stiles. + +This module contains unit tests for abydos.distance.Stiles +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Stiles + + +class StilesTestCases(unittest.TestCase): + """Test Stiles functions. + + abydos.distance.Stiles + """ + + cmp = Stiles() + cmp_no_d = Stiles(alphabet=0) + + def test_stiles_sim(self): + """Test abydos.distance.Stiles.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.760700314616495) + self.assertEqual(self.cmp.sim('', 'a'), 0.760700314616495) + self.assertEqual(self.cmp.sim('abc', ''), 0.7416916584588271) + self.assertEqual(self.cmp.sim('', 'abc'), 0.7416916584588271) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.4768516719017855) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5744293838) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5744293838) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5744293838) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5744293838) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5909028826 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertAlmostEqual(self.cmp_no_d.sim('a', ''), 0.9511587063686434) + self.assertAlmostEqual(self.cmp_no_d.sim('', 'a'), 0.9511587063686434) + self.assertAlmostEqual( + self.cmp_no_d.sim('abc', ''), 0.9309340273884292 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('', 'abc'), 0.9309340273884292 + ) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('abcd', 'efgh'), 0.47481536969259386 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.5216609379 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.5216609379 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.5216609379 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.5216609379 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5532905837 + ) + + def test_stiles_dist(self): + """Test abydos.distance.Stiles.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.239299685383505) + self.assertEqual(self.cmp.dist('', 'a'), 0.239299685383505) + self.assertEqual(self.cmp.dist('abc', ''), 0.2583083415411729) + self.assertEqual(self.cmp.dist('', 'abc'), 0.2583083415411729) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.5231483280982145) + 
+ self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4255706162) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4255706162) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4255706162) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4255706162) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4090971174 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('a', ''), 0.048841293631356586 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('', 'a'), 0.048841293631356586 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('abc', ''), 0.06906597261157077 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('', 'abc'), 0.06906597261157077 + ) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('abcd', 'efgh'), 0.5251846303074061 + ) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.4783390621 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.4783390621 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.4783390621 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.4783390621 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.4467094163 + ) + + def test_stiles_sim_score(self): + """Test abydos.distance.Stiles.sim_score.""" + # Base cases + self.assertAlmostEqual(self.cmp.sim_score('', ''), 16.292255897915638) + self.assertAlmostEqual(self.cmp.sim_score('a', ''), 8.992335212208333) + self.assertAlmostEqual(self.cmp.sim_score('', 'a'), 8.992335212208333) + self.assertAlmostEqual( + self.cmp.sim_score('abc', ''), 8.692417367356594 + ) + self.assertAlmostEqual( + self.cmp.sim_score('', 'abc'), 8.692417367356594 + ) + self.assertAlmostEqual( + self.cmp.sim_score('abc', 'abc'), 17.98245215160657 + ) + self.assertAlmostEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.8426334527850912 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 2.7352860243 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 2.7352860243 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 2.7352860243 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 2.7352860243 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 3.4428002638 + ) + + # Tests with alphabet=0 (no d factor) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('', ''), 13.647817481888637 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('a', ''), 13.22184890168726 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('', 'a'), 13.22184890168726 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('abc', ''), 13.522878821349728 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('', 'abc'), 13.522878821349728 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('abc', 'abc'), 15.69019612345796 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), -0.8061799153541304 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.7043650362 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.7043650362 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.7043650362 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.7043650362 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.8208082871 + ) + + 
def test_stiles_corr(self): + """Test abydos.distance.Stiles.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertAlmostEqual(self.cmp.corr('a', ''), 0.5214006292329901) + self.assertAlmostEqual(self.cmp.corr('', 'a'), 0.5214006292329901) + self.assertAlmostEqual(self.cmp.corr('abc', ''), 0.48338331691765435) + self.assertAlmostEqual(self.cmp.corr('', 'abc'), 0.48338331691765435) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp.corr('abcd', 'efgh'), -0.046296656196428934 + ) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.1488587676) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.1488587676) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.1488587676) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.1488587676) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.1818057652 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('a', ''), 0.9023174127372868) + self.assertAlmostEqual(self.cmp_no_d.corr('', 'a'), 0.9023174127372868) + self.assertAlmostEqual( + self.cmp_no_d.corr('abc', ''), 0.8618680547768583 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('', 'abc'), 0.8618680547768583 + ) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('abcd', 'efgh'), -0.05036926061481227 + ) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), 0.0433218759 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), 0.0433218759 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), 0.0433218759 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), 0.0433218759 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.1065811673 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_strcmp95.py b/tests/distance/test_distance_strcmp95.py index 1aff94a88..9b0e30ca3 100644 --- a/tests/distance/test_distance_strcmp95.py +++ b/tests/distance/test_distance_strcmp95.py @@ -40,6 +40,7 @@ class Strcmp95TestCases(unittest.TestCase): """ cmp = Strcmp95() + cmp_ls = Strcmp95(True) def test_strcmp95_sim(self): """Test abydos.distance.Strcmp95.sim.""" @@ -56,27 +57,19 @@ def test_strcmp95_sim(self): # long_strings = True self.assertAlmostEqual( - self.cmp.sim('DIXON', 'DICKSONX', True), 0.85393939 - ) - self.assertAlmostEqual( - self.cmp.sim('DWAYNE', 'DUANE', True), 0.89609090 - ) - self.assertAlmostEqual( - self.cmp.sim('MARTHA', 'MARHTA', True), 0.97083333 + self.cmp_ls.sim('DIXON', 'DICKSONX'), 0.85393939 ) + self.assertAlmostEqual(self.cmp_ls.sim('DWAYNE', 'DUANE'), 0.89609090) + self.assertAlmostEqual(self.cmp_ls.sim('MARTHA', 'MARHTA'), 0.97083333) # cover case where we don't boost, etc. 
self.assertAlmostEqual(self.cmp.sim('A', 'ABCDEFGHIJK'), 69 / 99) - self.assertAlmostEqual(self.cmp.sim('A', 'ABCDEFGHIJK', True), 69 / 99) + self.assertAlmostEqual(self.cmp_ls.sim('A', 'ABCDEFGHIJK'), 69 / 99) self.assertAlmostEqual(self.cmp.sim('d', 'abcdefgh'), 0.708333333) + self.assertAlmostEqual(self.cmp_ls.sim('d', 'abcdefgh'), 0.708333333) + self.assertAlmostEqual(self.cmp_ls.sim('1', 'abc1efgh'), 0.708333333) self.assertAlmostEqual( - self.cmp.sim('d', 'abcdefgh', True), 0.708333333 - ) - self.assertAlmostEqual( - self.cmp.sim('1', 'abc1efgh', True), 0.708333333 - ) - self.assertAlmostEqual( - self.cmp.sim('12hundredths', '12hundred', True), 0.916666667 + self.cmp_ls.sim('12hundredths', '12hundred'), 0.916666667 ) # Test wrapper diff --git a/tests/distance/test_distance_stuart_tau.py b/tests/distance/test_distance_stuart_tau.py new file mode 100644 index 000000000..82d56181f --- /dev/null +++ b/tests/distance/test_distance_stuart_tau.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_stuart_tau. + +This module contains unit tests for abydos.distance.StuartTau +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import StuartTau + + +class StuartTauTestCases(unittest.TestCase): + """Test StuartTau functions. 
+ + abydos.distance.StuartTau + """ + + cmp = StuartTau() + cmp_no_d = StuartTau(alphabet=0) + + def test_stuart_tau_sim(self): + """Test abydos.distance.StuartTau.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5025510204081632) + self.assertEqual(self.cmp.sim('a', ''), 0.5025380049979176) + self.assertEqual(self.cmp.sim('', 'a'), 0.5025380049979176) + self.assertEqual(self.cmp.sim('abc', ''), 0.5025249895876718) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5025249895876718) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.5025510204081632) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5024859433569346) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5025119742) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5025119742) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5025119742) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5025119742) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5025054665 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.3) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4259259259 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4259259259 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4259259259 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4259259259 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_stuart_tau_dist(self): + """Test abydos.distance.StuartTau.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.49744897959183676) + self.assertEqual(self.cmp.dist('a', ''), 0.49746199500208244) + self.assertEqual(self.cmp.dist('', 'a'), 0.49746199500208244) + self.assertEqual(self.cmp.dist('abc', ''), 0.4974750104123282) + self.assertEqual(self.cmp.dist('', 'abc'), 0.4974750104123282) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.49744897959183676) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.49751405664306536) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4974880258) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4974880258) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4974880258) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4974880258) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4974945335 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.7) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5740740741 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5740740741 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5740740741 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5740740741 + ) + 
self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_stuart_tau_corr(self): + """Test abydos.distance.StuartTau.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.00510204081632653) + self.assertEqual(self.cmp.corr('a', ''), 0.005076009995835068) + self.assertEqual(self.cmp.corr('', 'a'), 0.005076009995835068) + self.assertEqual(self.cmp.corr('abc', ''), 0.005049979175343606) + self.assertEqual(self.cmp.corr('', 'abc'), 0.005049979175343606) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.00510204081632653) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.0049718867138692216) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0050239484) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0050239484) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0050239484) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0050239484) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.0050109329 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -0.4) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1481481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1481481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1481481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1481481481 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_synoname.py b/tests/distance/test_distance_synoname.py index a1f8e1d7c..14e53ab9e 100644 --- a/tests/distance/test_distance_synoname.py +++ b/tests/distance/test_distance_synoname.py @@ -194,12 +194,12 @@ def test_synoname_dist_abs(self): """Test abydos.distance.Synoname.dist_abs.""" # Base cases self.assertEqual(self.cmp.dist_abs('', ''), 1) - self.assertEqual(self.cmp.dist_abs('', '', tests=['exact']), 1) - self.assertEqual(self.cmp.dist_abs('', '', tests=[]), 13) + self.assertEqual(Synoname(tests=['exact']).dist_abs('', ''), 1) + self.assertEqual(Synoname(tests=[]).dist_abs('', ''), 13) self.assertEqual( - self.cmp.dist_abs('', '', tests=['nonsense-test']), 13 + Synoname(tests=['nonsense-test']).dist_abs('', ''), 13 ) - self.assertEqual(self.cmp.dist_abs('', '', ret_name=True), 'exact') + self.assertEqual(Synoname(ret_name=True).dist_abs('', ''), 'exact') # Test input formats self.assertEqual( @@ -246,219 +246,174 @@ def test_synoname_dist_abs(self): # Types 1-12 self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Pieter', ''), - ('Brueghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Pieter', ''), ('Brueghel', 'Pieter', '') ), 'exact', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel II', 'Pieter', ''), - ('Brueghel I', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel II', 'Pieter', ''), ('Brueghel I', 'Pieter', '') ), 'no_match', ) self.assertEqual( - self.cmp.dist_abs( - ('Breghel', 'Pieter', ''), - ('Brueghel', 'Pieter', ''), - ret_name=True, + 
Synoname(ret_name=True).dist_abs( + ('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', '') ), 'omission', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Pieter', ''), - ('Breghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Pieter', ''), ('Breghel', 'Pieter', '') ), 'omission', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Piter', ''), - ('Brueghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Piter', ''), ('Brueghel', 'Pieter', '') ), 'omission', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Pieter', ''), - ('Brueghel', 'Piter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Pieter', ''), ('Brueghel', 'Piter', '') ), 'omission', ) self.assertEqual( - self.cmp.dist_abs( - ('Brughel', 'Pieter', ''), - ('Breghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brughel', 'Pieter', ''), ('Breghel', 'Pieter', '') ), 'substitution', ) self.assertEqual( - self.cmp.dist_abs( - ('Breughel', 'Peter', ''), - ('Breughel', 'Piter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Breughel', 'Peter', ''), ('Breughel', 'Piter', '') ), 'substitution', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Pieter', ''), - ('Breughel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Pieter', ''), ('Breughel', 'Pieter', '') ), 'transposition', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Peiter', ''), - ('Brueghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Peiter', ''), ('Brueghel', 'Pieter', '') ), 'transposition', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel:', 'Pieter', ''), - ('Brueghel', 'Pi-eter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel:', 'Pieter', ''), ('Brueghel', 'Pi-eter', '') ), 'punctuation', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel,', 'Pieter', ''), - ('Brueghel', 'Pieter...', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel,', 'Pieter', ''), ('Brueghel', 'Pieter...', '') ), 'punctuation', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('Seu rat', 'George Pierre', ''), ('Seu-rat', 'George-Pierre', ''), - ret_name=True, ), 'punctuation', ) self.assertEqual( - self.cmp.dist_abs( - ('Picasso', '', ''), ('Picasso', 'Pablo', ''), ret_name=True + Synoname(ret_name=True).dist_abs( + ('Picasso', '', ''), ('Picasso', 'Pablo', '') ), 'no_first', ) self.assertEqual( - self.cmp.dist_abs( - ('Pereira', 'I. R.', ''), - ('Pereira', 'Irene Rice', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Pereira', 'I. R.', ''), ('Pereira', 'Irene Rice', '') ), 'initials', ) self.assertEqual( - self.cmp.dist_abs( - ('Pereira', 'I.', ''), - ('Pereira', 'Irene Rice', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Pereira', 'I.', ''), ('Pereira', 'Irene Rice', '') ), 'initials', ) self.assertNotEqual( - self.cmp.dist_abs( - ('Pereira', 'I. R.', ''), - ('Pereira', 'I. Smith', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', '') ), 'initials', ) self.assertNotEqual( - self.cmp.dist_abs( - ('Pereira', 'I. R. S.', ''), - ('Pereira', 'I. S. R.', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Pereira', 'I. R. S.', ''), ('Pereira', 'I. S. 
R.', '') ), 'initials', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('de Goya', 'Francisco', ''), ('de Goya y Lucientes', 'Francisco', ''), - ret_name=True, ), 'extension', ) self.assertEqual( - self.cmp.dist_abs( - ('Seurat', 'George', ''), - ('Seurat', 'George-Pierre', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Seurat', 'George', ''), ('Seurat', 'George-Pierre', '') ), 'extension', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('Gericault', 'Theodore', ''), ('Gericault', 'Jean Louis Andre Theodore', ''), - ret_name=True, ), 'inclusion', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('Dore', 'Gustave', ''), ('Dore', 'Paul Gustave Louis Christophe', ''), - ret_name=True, ), 'inclusion', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('Rosetti', 'Dante Gabriel', ''), ('Rosetti', 'Gabriel Charles Dante', ''), - ret_name=True, ), 'word_approx', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('di Domenico di Bonaventura', 'Cosimo', ''), ('di Tomme di Nuto', 'Luca', ''), - ret_name=True, ), 'no_match', ) self.assertEqual( - self.cmp.dist_abs( - ('Pereira', 'I. R.', ''), - ('Pereira', 'I. Smith', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', '') ), 'word_approx', ) self.assertEqual( - self.cmp.dist_abs( + Synoname(ret_name=True).dist_abs( ('Antonello da Messina', '', ''), ('Messina', 'Antonello da', ''), - ret_name=True, ), 'confusions', ) self.assertEqual( - self.cmp.dist_abs( - ('Brueghel', 'Pietter', ''), - ('Bruegghel', 'Pieter', ''), - ret_name=True, + Synoname(ret_name=True).dist_abs( + ('Brueghel', 'Pietter', ''), ('Bruegghel', 'Pieter', '') ), 'char_approx', ) diff --git a/tests/distance/test_distance_tarantula.py b/tests/distance/test_distance_tarantula.py new file mode 100644 index 000000000..05da6df09 --- /dev/null +++ b/tests/distance/test_distance_tarantula.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tarantula. + +This module contains unit tests for abydos.distance.Tarantula +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Tarantula + + +class TarantulaTestCases(unittest.TestCase): + """Test Tarantula functions. 
+ + abydos.distance.Tarantula + """ + + cmp = Tarantula() + cmp_no_d = Tarantula(alphabet=0) + + def test_tarantula_sim(self): + """Test abydos.distance.Tarantula.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9939382807 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3888888889 + ) + + def test_tarantula_dist(self): + """Test abydos.distance.Tarantula.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0060617193 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6111111111 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tarwid.py 
b/tests/distance/test_distance_tarwid.py new file mode 100644 index 000000000..10806f736 --- /dev/null +++ b/tests/distance/test_distance_tarwid.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tarwid. + +This module contains unit tests for abydos.distance.Tarwid +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Tarwid + + +class TarwidTestCases(unittest.TestCase): + """Test Tarwid functions. + + abydos.distance.Tarwid + """ + + cmp = Tarwid() + cmp_no_d = Tarwid(alphabet=0) + + def test_tarwid_sim(self): + """Test abydos.distance.Tarwid.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.9949238578680203) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9849246231) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9849246231) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9849246231) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9849246231) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.980350125 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.4285714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.4285714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.4285714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.4285714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.4711538462 + ) + + def test_tarwid_dist(self): + """Test abydos.distance.Tarwid.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.005076142131979711) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0150753769) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 
0.0150753769) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0150753769) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0150753769) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.019649875 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.5714285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.5714285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.5714285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.5714285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5288461538 + ) + + def test_tarwid_corr(self): + """Test abydos.distance.Tarwid.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 0.9898477157360406) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9698492462) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9698492462) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9698492462) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9698492462) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9607002501 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual( + self.cmp_no_d.corr('Nigel', 'Niall'), -0.1428571429 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Niall', 'Nigel'), -0.1428571429 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Colin', 'Coiln'), -0.1428571429 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('Coiln', 'Colin'), -0.1428571429 + ) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.0576923077 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tetrachoric.py b/tests/distance/test_distance_tetrachoric.py new file mode 100644 index 000000000..4442184ac --- /dev/null +++ b/tests/distance/test_distance_tetrachoric.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tetrachoric. + +This module contains unit tests for abydos.distance.Tetrachoric +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Tetrachoric + + +class TetrachoricTestCases(unittest.TestCase): + """Test Tetrachoric functions. + + abydos.distance.Tetrachoric + """ + + cmp = Tetrachoric() + cmp_no_d = Tetrachoric(alphabet=0) + + def test_tetrachoric_sim(self): + """Test abydos.distance.Tetrachoric.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9915587467) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9915587467) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9949989546 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_tetrachoric_dist(self): + """Test abydos.distance.Tetrachoric.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0084412533) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0084412533) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0050010454 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + 
self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_tetrachoric_corr(self): + """Test abydos.distance.Tetrachoric.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 1.0) + self.assertEqual(self.cmp.corr('', 'a'), 1.0) + self.assertEqual(self.cmp.corr('abc', ''), 1.0) + self.assertEqual(self.cmp.corr('', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9831174935) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9831174935) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9899979092 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tf_idf.py b/tests/distance/test_distance_tf_idf.py new file mode 100644 index 000000000..4830c90fe --- /dev/null +++ b/tests/distance/test_distance_tf_idf.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tf_idf. 
+ +This module contains unit tests for abydos.distance.TFIDF +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import os +import unittest + +from abydos.corpus import UnigramCorpus +from abydos.distance import TFIDF +from abydos.tokenizer import QGrams +from abydos.util import download_package, package_path + +from six import PY2 + + +class TFIDFTestCases(unittest.TestCase): + """Test TFIDF functions. + + abydos.distance.TFIDF + """ + + cmp = TFIDF() + + def test_tf_idf_sim(self): + """Test abydos.distance.TFIDF.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.304044497) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.304044497) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.4676712137 + ) + + def test_tf_idf_dist(self): + """Test abydos.distance.TFIDF.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.695955503) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.695955503) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5323287863 + ) + + def test_tf_idf_corpus(self): + """Test abydos.distance.TFIDF.sim & .dist with corpus.""" + if PY2: # disable testing in Py2.7; the pickled data isn't supported + return + + q3_corpus = UnigramCorpus(word_tokenizer=QGrams(qval=3)) + download_package('en_qgram', silent=True) + q3_corpus.load_corpus( + os.path.join(package_path('en_qgram'), 'q3_en.dat') + ) + cmp_q3 = TFIDF(tokenizer=QGrams(qval=3), corpus=q3_corpus) + + self.assertAlmostEqual(cmp_q3.sim('Nigel', 'Niall'), 0.259985047) + self.assertAlmostEqual(cmp_q3.sim('Niall', 'Nigel'), 0.259985047) + self.assertAlmostEqual(cmp_q3.sim('Colin', 'Coiln'), 0.114867563) + self.assertAlmostEqual(cmp_q3.sim('Coiln', 'Colin'), 0.114867563) + + self.assertAlmostEqual(cmp_q3.dist('Nigel', 'Niall'), 0.740014953) + self.assertAlmostEqual(cmp_q3.dist('Niall', 'Nigel'), 0.740014953) + self.assertAlmostEqual(cmp_q3.dist('Colin', 'Coiln'), 0.885132437) + self.assertAlmostEqual(cmp_q3.dist('Coiln', 'Colin'), 0.885132437) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tichy.py b/tests/distance/test_distance_tichy.py new file mode 100644 index 000000000..022ac7627 --- /dev/null +++ b/tests/distance/test_distance_tichy.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tichy. + +This module contains unit tests for abydos.distance.Tichy +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Tichy + + +class TichyTestCases(unittest.TestCase): + """Test Tichy functions. + + abydos.distance.Tichy + """ + + cmp = Tichy() + + def test_tichy_dist(self): + """Test abydos.distance.Tichy.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4444444444 + ) + + def test_tichy_sim(self): + """Test abydos.distance.Tichy.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5555555556 + ) + + def test_tichy_dist_abs(self): + """Test abydos.distance.Tichy.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1) + self.assertEqual(self.cmp.dist_abs('abc', ''), 0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 3) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 4) + + self.assertEqual(self.cmp.dist_abs('Nigel', 'Niall'), 4) + self.assertEqual(self.cmp.dist_abs('Niall', 'Nigel'), 4) + self.assertEqual(self.cmp.dist_abs('Colin', 'Coiln'), 4) + self.assertEqual(self.cmp.dist_abs('Coiln', 'Colin'), 4) + self.assertEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4) + + # Examples from paper + + self.assertEqual(self.cmp.dist_abs('abda', 'abcab'), 3) + self.assertEqual(self.cmp.dist_abs('shanghai', 'sakhalin'), 7) + self.assertEqual(self.cmp.dist_abs('abcde', 'deabc'), 2) + 
self.assertEqual(self.cmp.dist_abs('abc', 'abcabc'), 2) + self.assertEqual(self.cmp.dist_abs('abcdea', 'cdab'), 2) + self.assertEqual(self.cmp.dist_abs('abcdefdeab', 'cdeabc'), 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tulloss_r.py b/tests/distance/test_distance_tulloss_r.py new file mode 100644 index 000000000..f03fd6e63 --- /dev/null +++ b/tests/distance/test_distance_tulloss_r.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tulloss_r. + +This module contains unit tests for abydos.distance.TullossR +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import TullossR + + +class TullossRTestCases(unittest.TestCase): + """Test TullossR functions. + + abydos.distance.TullossR + """ + + cmp = TullossR() + + def test_tulloss_r_sim(self): + """Test abydos.distance.TullossR.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3421811272) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3421811272) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3421811272) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3421811272) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5439073716 + ) + + def test_tulloss_r_dist(self): + """Test abydos.distance.TullossR.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6578188728) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6578188728) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6578188728) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6578188728) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4560926284 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tulloss_s.py b/tests/distance/test_distance_tulloss_s.py new file mode 100644 index 000000000..a26079107 --- /dev/null +++ b/tests/distance/test_distance_tulloss_s.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. 
+# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tulloss_s. + +This module contains unit tests for abydos.distance.TullossS +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import TullossS + + +class TullossSTestCases(unittest.TestCase): + """Test TullossS functions. + + abydos.distance.TullossS + """ + + cmp = TullossS() + + def test_tulloss_s_sim(self): + """Test abydos.distance.TullossS.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.5968309535438173) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.8277670301) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.8277670301) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.8277670301) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.8277670301) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8951695896 + ) + + def test_tulloss_s_dist(self): + """Test abydos.distance.TullossS.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.4031690464561827) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.1722329699) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.1722329699) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.1722329699) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.1722329699) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1048304104 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tulloss_t.py b/tests/distance/test_distance_tulloss_t.py new file mode 100644 index 000000000..c407d7713 --- /dev/null +++ b/tests/distance/test_distance_tulloss_t.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tulloss_t. + +This module contains unit tests for abydos.distance.TullossT +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import TullossT + + +class TullossTTestCases(unittest.TestCase): + """Test TullossT functions. + + abydos.distance.TullossT + """ + + cmp = TullossT() + + def test_tulloss_t_sim(self): + """Test abydos.distance.TullossT.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5322088457) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5322088457) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5322088457) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5322088457) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6739526335 + ) + + def test_tulloss_t_dist(self): + """Test abydos.distance.TullossT.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4677911543) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4677911543) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4677911543) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4677911543) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3260473665 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tulloss_u.py b/tests/distance/test_distance_tulloss_u.py new file mode 100644 index 000000000..58e74e54c --- /dev/null +++ b/tests/distance/test_distance_tulloss_u.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_tulloss_u. + +This module contains unit tests for abydos.distance.TullossU +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import TullossU + + +class TullossUTestCases(unittest.TestCase): + """Test TullossU functions. 
+ + abydos.distance.TullossU + """ + + cmp = TullossU() + + def test_tulloss_u_sim(self): + """Test abydos.distance.TullossU.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9328858041 + ) + + def test_tulloss_u_dist(self): + """Test abydos.distance.TullossU.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0671141959 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_tversky.py b/tests/distance/test_distance_tversky.py index dd0e1e3f7..3a132402d 100644 --- a/tests/distance/test_distance_tversky.py +++ b/tests/distance/test_distance_tversky.py @@ -31,7 +31,7 @@ import unittest from abydos.distance import Tversky, dist_tversky, sim_tversky -from abydos.tokenizer import QGrams +from abydos.tokenizer import QGrams, WhitespaceTokenizer from .. 
import NONQ_FROM, NONQ_TO @@ -43,6 +43,8 @@ class TverskyIndexTestCases(unittest.TestCase): """ cmp = Tversky() + cmp_q2 = Tversky(tokenizer=QGrams(2)) + cmp_ws = Tversky(tokenizer=WhitespaceTokenizer()) def test_tversky_sim(self): """Test abydos.distance.Tversky.sim.""" @@ -51,52 +53,112 @@ def test_tversky_sim(self): self.assertEqual(self.cmp.sim('', 'neilsen'), 0) self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 11) - self.assertEqual(self.cmp.sim('', '', 2), 1) - self.assertEqual(self.cmp.sim('nelson', '', 2), 0) - self.assertEqual(self.cmp.sim('', 'neilsen', 2), 0) - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 2), 4 / 11) + self.assertEqual(self.cmp_q2.sim('', ''), 1) + self.assertEqual(self.cmp_q2.sim('nelson', ''), 0) + self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0) + self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 11) # test valid alpha & beta - self.assertRaises(ValueError, self.cmp.sim, 'abcd', 'dcba', 2, -1, -1) - self.assertRaises(ValueError, self.cmp.sim, 'abcd', 'dcba', 2, -1, 0) - self.assertRaises(ValueError, self.cmp.sim, 'abcd', 'dcba', 2, 0, -1) + self.assertRaises( + ValueError, Tversky(alpha=-1.0, beta=-1.0).sim, 'abcd', 'dcba' + ) + self.assertRaises( + ValueError, Tversky(alpha=-1.0, beta=0.0).sim, 'abcd', 'dcba' + ) + self.assertRaises( + ValueError, Tversky(alpha=0.0, beta=-1.0).sim, 'abcd', 'dcba' + ) # test empty QGrams - self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen', 7), 0.0) + self.assertAlmostEqual( + Tversky(tokenizer=QGrams(7, start_stop='')).sim( + 'nelson', 'neilsen' + ), + 0.0, + ) # test unequal alpha & beta - self.assertAlmostEqual(self.cmp.sim('niall', 'neal', 2, 2, 1), 3 / 11) - self.assertAlmostEqual(self.cmp.sim('niall', 'neal', 2, 1, 2), 3 / 10) - self.assertAlmostEqual(self.cmp.sim('niall', 'neal', 2, 2, 2), 3 / 13) + self.assertAlmostEqual( + Tversky(alpha=2.0, beta=1.0, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 3 / 11, + ) + self.assertAlmostEqual( + Tversky(alpha=1.0, beta=2.0, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 3 / 10, + ) + self.assertAlmostEqual( + Tversky(alpha=2.0, beta=2.0, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 3 / 13, + ) # test bias parameter self.assertAlmostEqual( - self.cmp.sim('niall', 'neal', 2, 1, 1, 0.5), 7 / 11 + Tversky(alpha=1.0, beta=1.0, bias=0.5, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 7 / 11, ) self.assertAlmostEqual( - self.cmp.sim('niall', 'neal', 2, 2, 1, 0.5), 7 / 9 + Tversky(alpha=2.0, beta=1.0, bias=0.5, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 7 / 9, ) self.assertAlmostEqual( - self.cmp.sim('niall', 'neal', 2, 1, 2, 0.5), 7 / 15 + Tversky(alpha=1.0, beta=2.0, bias=0.5, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 7 / 15, ) self.assertAlmostEqual( - self.cmp.sim('niall', 'neal', 2, 2, 2, 0.5), 7 / 11 + Tversky(alpha=2.0, beta=2.0, bias=0.5, tokenizer=QGrams(2)).sim( + 'niall', 'neal' + ), + 7 / 11, ) # supplied q-gram tests - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('')), 1) - self.assertEqual(self.cmp.sim(QGrams('nelson'), QGrams('')), 0) - self.assertEqual(self.cmp.sim(QGrams(''), QGrams('neilsen')), 0) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.sim( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 
0, + ) self.assertAlmostEqual( - self.cmp.sim(QGrams('nelson'), QGrams('neilsen')), 4 / 11 + self.cmp.sim( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 4 / 11, ) # non-q-gram tests - self.assertEqual(self.cmp.sim('', '', 0), 1) - self.assertEqual(self.cmp.sim('the quick', '', 0), 0) - self.assertEqual(self.cmp.sim('', 'the quick', 0), 0) - self.assertAlmostEqual(self.cmp.sim(NONQ_FROM, NONQ_TO, 0), 1 / 3) - self.assertAlmostEqual(self.cmp.sim(NONQ_TO, NONQ_FROM, 0), 1 / 3) + self.assertEqual(self.cmp_ws.sim('', ''), 1) + self.assertEqual(self.cmp_ws.sim('the quick', ''), 0) + self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3) + self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3) # Test wrapper self.assertAlmostEqual(sim_tversky('nelson', 'neilsen'), 4 / 11) @@ -108,54 +170,112 @@ def test_tversky_dist(self): self.assertEqual(self.cmp.dist('', 'neilsen'), 1) self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 11) - self.assertEqual(self.cmp.dist('', '', 2), 0) - self.assertEqual(self.cmp.dist('nelson', '', 2), 1) - self.assertEqual(self.cmp.dist('', 'neilsen', 2), 1) - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 2), 7 / 11) + self.assertEqual(self.cmp_q2.dist('', ''), 0) + self.assertEqual(self.cmp_q2.dist('nelson', ''), 1) + self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1) + self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 11) # test valid alpha & beta - self.assertRaises(ValueError, self.cmp.dist, 'abcd', 'dcba', 2, -1, -1) - self.assertRaises(ValueError, self.cmp.dist, 'abcd', 'dcba', 2, -1, 0) - self.assertRaises(ValueError, self.cmp.dist, 'abcd', 'dcba', 2, 0, -1) + self.assertRaises( + ValueError, Tversky(alpha=-1.0, beta=-1.0).dist, 'abcd', 'dcba' + ) + self.assertRaises( + ValueError, Tversky(alpha=-1.0, beta=0.0).dist, 'abcd', 'dcba' + ) + self.assertRaises( + ValueError, Tversky(alpha=0.0, beta=-1.0).dist, 'abcd', 'dcba' + ) # test empty QGrams - self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen', 7), 1.0) + self.assertAlmostEqual( + Tversky(tokenizer=QGrams(7, start_stop='')).dist( + 'nelson', 'neilsen' + ), + 1.0, + ) # test unequal alpha & beta - self.assertAlmostEqual(self.cmp.dist('niall', 'neal', 2, 2, 1), 8 / 11) - self.assertAlmostEqual(self.cmp.dist('niall', 'neal', 2, 1, 2), 7 / 10) self.assertAlmostEqual( - self.cmp.dist('niall', 'neal', 2, 2, 2), 10 / 13 + Tversky(alpha=2.0, beta=1.0, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 8 / 11, + ) + self.assertAlmostEqual( + Tversky(alpha=1.0, beta=2.0, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 7 / 10, + ) + self.assertAlmostEqual( + Tversky(alpha=2.0, beta=2.0, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 10 / 13, ) # test bias parameter self.assertAlmostEqual( - self.cmp.dist('niall', 'neal', 2, 1, 1, 0.5), 4 / 11 + Tversky(alpha=1.0, beta=1.0, bias=0.5, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 4 / 11, ) self.assertAlmostEqual( - self.cmp.dist('niall', 'neal', 2, 2, 1, 0.5), 2 / 9 + Tversky(alpha=2.0, beta=1.0, bias=0.5, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 2 / 9, ) self.assertAlmostEqual( - self.cmp.dist('niall', 'neal', 2, 1, 2, 0.5), 8 / 15 + Tversky(alpha=1.0, beta=2.0, bias=0.5, tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 8 / 15, ) self.assertAlmostEqual( - self.cmp.dist('niall', 'neal', 2, 2, 2, 0.5), 4 / 11 + Tversky(alpha=2.0, beta=2.0, bias=0.5, 
tokenizer=QGrams(2)).dist( + 'niall', 'neal' + ), + 4 / 11, ) # supplied q-gram tests - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('')), 0) - self.assertEqual(self.cmp.dist(QGrams('nelson'), QGrams('')), 1) - self.assertEqual(self.cmp.dist(QGrams(''), QGrams('neilsen')), 1) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 0, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('').get_counter(), + ), + 1, + ) + self.assertEqual( + self.cmp.dist( + QGrams().tokenize('').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 1, + ) self.assertAlmostEqual( - self.cmp.dist(QGrams('nelson'), QGrams('neilsen')), 7 / 11 + self.cmp.dist( + QGrams().tokenize('nelson').get_counter(), + QGrams().tokenize('neilsen').get_counter(), + ), + 7 / 11, ) # non-q-gram tests - self.assertEqual(self.cmp.dist('', '', 0), 0) - self.assertEqual(self.cmp.dist('the quick', '', 0), 1) - self.assertEqual(self.cmp.dist('', 'the quick', 0), 1) - self.assertAlmostEqual(self.cmp.dist(NONQ_FROM, NONQ_TO, 0), 2 / 3) - self.assertAlmostEqual(self.cmp.dist(NONQ_TO, NONQ_FROM, 0), 2 / 3) + self.assertEqual(self.cmp_ws.dist('', ''), 0) + self.assertEqual(self.cmp_ws.dist('the quick', ''), 1) + self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 2 / 3) + self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 2 / 3) # Test wrapper self.assertAlmostEqual(dist_tversky('nelson', 'neilsen'), 7 / 11) diff --git a/tests/distance/test_distance_typo.py b/tests/distance/test_distance_typo.py index f755222f9..0cf42b669 100644 --- a/tests/distance/test_distance_typo.py +++ b/tests/distance/test_distance_typo.py @@ -40,6 +40,7 @@ class TypoTestCases(unittest.TestCase): """ cmp = Typo() + cmp_auto = Typo(layout='auto', failsafe=True) def test_typo_dist_abs(self): """Test abydos.distance.Typo.dist_abs.""" @@ -53,20 +54,24 @@ def test_typo_dist_abs(self): self.assertEqual(self.cmp.dist_abs('asdf', 'qsdf'), 0.5) self.assertAlmostEqual( - self.cmp.dist_abs('asdf', 'asdt', metric='euclidean'), 0.70710677 + Typo(metric='euclidean').dist_abs('asdf', 'asdt'), 0.70710677 ) self.assertAlmostEqual( - self.cmp.dist_abs('asdf', 'asdt', metric='manhattan'), 1 + Typo(metric='manhattan').dist_abs('asdf', 'asdt'), 1 ) self.assertAlmostEqual( - self.cmp.dist_abs('asdf', 'asdt', metric='log-euclidean'), - 0.4406868, + Typo(metric='log-euclidean').dist_abs('asdf', 'asdt'), 0.4406868 ) self.assertAlmostEqual( - self.cmp.dist_abs('asdf', 'asdt', metric='log-manhattan'), - 0.54930615, + Typo(metric='log-manhattan').dist_abs('asdf', 'asdt'), 0.54930615 ) + self.assertEqual(self.cmp_auto.dist_abs('Schluß', 'Schluss'), 3) + self.assertAlmostEqual( + self.cmp_auto.dist_abs('délicat', 'delicate'), 1.7071068 + ) + self.assertEqual(self.cmp_auto.dist_abs('비빔밥', 'Bibimbap'), 11) + self.assertRaises(ValueError, self.cmp.dist_abs, 'asdf', 'Ösdf') # Test wrapper @@ -86,18 +91,17 @@ def test_typo_sim(self): self.assertEqual(self.cmp.sim('asdf', 'qsdf'), 0.875) self.assertAlmostEqual( - self.cmp.sim('asdf', 'asdt', metric='euclidean'), - 1 - (0.70710677 / 4), + Typo(metric='euclidean').sim('asdf', 'asdt'), 1 - (0.70710677 / 4) ) self.assertAlmostEqual( - self.cmp.sim('asdf', 'asdt', metric='manhattan'), 0.75 + Typo(metric='manhattan').sim('asdf', 'asdt'), 0.75 ) self.assertAlmostEqual( - self.cmp.sim('asdf', 'asdt', metric='log-euclidean'), + 
Typo(metric='log-euclidean').sim('asdf', 'asdt'), 1 - (0.4406868 / 4), ) self.assertAlmostEqual( - self.cmp.sim('asdf', 'asdt', metric='log-manhattan'), + Typo(metric='log-manhattan').sim('asdf', 'asdt'), 1 - (0.54930615 / 4), ) @@ -119,18 +123,16 @@ def test_typo_dist(self): self.assertEqual(self.cmp.dist('asdf', 'qsdf'), 0.125) self.assertAlmostEqual( - self.cmp.dist('asdf', 'asdt', metric='euclidean'), 0.70710677 / 4 + Typo(metric='euclidean').dist('asdf', 'asdt'), 0.70710677 / 4 ) self.assertAlmostEqual( - self.cmp.dist('asdf', 'asdt', metric='manhattan'), 0.25 + Typo(metric='manhattan').dist('asdf', 'asdt'), 0.25 ) self.assertAlmostEqual( - self.cmp.dist('asdf', 'asdt', metric='log-euclidean'), - 0.4406868 / 4, + Typo(metric='log-euclidean').dist('asdf', 'asdt'), 0.4406868 / 4 ) self.assertAlmostEqual( - self.cmp.dist('asdf', 'asdt', metric='log-manhattan'), - 0.54930615 / 4, + Typo(metric='log-manhattan').dist('asdf', 'asdt'), 0.54930615 / 4 ) # Test wrapper diff --git a/tests/distance/test_distance_unigram_subtuple.py b/tests/distance/test_distance_unigram_subtuple.py new file mode 100644 index 000000000..bd95c5c3b --- /dev/null +++ b/tests/distance/test_distance_unigram_subtuple.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unigram_subtuple. + +This module contains unit tests for abydos.distance.UnigramSubtuple +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnigramSubtuple + + +class UnigramSubtupleTestCases(unittest.TestCase): + """Test UnigramSubtuple functions. 
+ + abydos.distance.UnigramSubtuple + """ + + cmp = UnigramSubtuple() + cmp_no_d = UnigramSubtuple(alphabet=0) + + def test_unigram_subtuple_sim(self): + """Test abydos.distance.UnigramSubtuple.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.3557288160556184) + self.assertEqual(self.cmp.sim('', 'a'), 0.3557288160556184) + self.assertEqual(self.cmp.sim('abc', ''), 0.10825796687276863) + self.assertEqual(self.cmp.sim('', 'abc'), 0.10825796687276863) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6276193132) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6276193132) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6276193132) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6276193132) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7696362294 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_unigram_subtuple_dist(self): + """Test abydos.distance.UnigramSubtuple.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.6442711839443815) + self.assertEqual(self.cmp.dist('', 'a'), 0.6442711839443815) + self.assertEqual(self.cmp.dist('abc', ''), 0.8917420331272313) + self.assertEqual(self.cmp.dist('', 'abc'), 0.8917420331272313) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3723806868) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3723806868) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3723806868) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3723806868) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2303637706 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_unigram_subtuple_sim_score(self): + """Test abydos.distance.UnigramSubtuple.sim_score.""" + 
# Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.964750587602003) + self.assertEqual(self.cmp.sim_score('a', ''), 0.765430557931535) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.765430557931535) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.3365937758831885) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.3365937758831885) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 3.10918249812297) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.461880260111438 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 2.2621288443 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 2.2621288443 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 2.2621288443 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 2.2621288443 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 3.3012550999 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), -6.58) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), -6.848173581803079) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), -6.848173581803079) + self.assertEqual( + self.cmp_no_d.sim_score('abc', ''), -7.317426209258153 + ) + self.assertEqual( + self.cmp_no_d.sim_score('', 'abc'), -7.317426209258153 + ) + self.assertEqual( + self.cmp_no_d.sim_score('abc', 'abc'), -4.544837487018372 + ) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh'), -8.315721908477162 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall'), -5.7513749089 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel'), -5.7513749089 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln'), -5.7513749089 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin'), -5.7513749089 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -4.8615487946 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_a.py b/tests/distance/test_distance_unknown_a.py new file mode 100644 index 000000000..127c41981 --- /dev/null +++ b/tests/distance/test_distance_unknown_a.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_a. + +This module contains unit tests for abydos.distance.UnknownA +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownA + + +class UnknownATestCases(unittest.TestCase): + """Test UnknownA functions. 
+ + abydos.distance.UnknownA + """ + + cmp = UnknownA() + cmp_no_d = UnknownA(alphabet=0) + + def test_unknown_a_sim(self): + """Test abydos.distance.UnknownA.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.496790757381258) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7480719794) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7480719794) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8474160207 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.35 + ) + + def test_unknown_a_dist(self): + """Test abydos.distance.UnknownA.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.503209242618742) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2519280206) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2519280206) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1525839793 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.65 + ) + + def test_unknown_a_corr(self): + """Test abydos.distance.UnknownA.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + 
self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -0.006418485237483954) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.4961439589) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.4961439589) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6948320413 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -0.5) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -0.5) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_b.py b/tests/distance/test_distance_unknown_b.py new file mode 100644 index 000000000..cfbe1d94a --- /dev/null +++ b/tests/distance/test_distance_unknown_b.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_b. + +This module contains unit tests for abydos.distance.UnknownB +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownB + + +class UnknownBTestCases(unittest.TestCase): + """Test UnknownB functions. 
+ + abydos.distance.UnknownB + """ + + cmp = UnknownB() + cmp_no_d = UnknownB(alphabet=0) + + def test_unknown_b_sim(self): + """Test abydos.distance.UnknownB.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 4.1196952743799446e-05) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2461588279) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2461588279) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.439469213 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1090909091 + ) + + def test_unknown_b_dist(self): + """Test abydos.distance.UnknownB.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9999588030472562) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7538411721) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7538411721) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.560530787 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8909090909 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_c.py b/tests/distance/test_distance_unknown_c.py new file mode 100644 index 000000000..5dfd3e345 --- /dev/null +++ 
b/tests/distance/test_distance_unknown_c.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_c. + +This module contains unit tests for abydos.distance.UnknownC +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownC + + +class UnknownCTestCases(unittest.TestCase): + """Test UnknownC functions. + + abydos.distance.UnknownC + """ + + cmp = UnknownC() + cmp_no_d = UnknownC(alphabet=0) + + def test_unknown_c_sim(self): + """Test abydos.distance.UnknownC.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.1987163029525032) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1666666667) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.09577771 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1926686589 + ) + + def test_unknown_c_dist(self): + """Test abydos.distance.UnknownC.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.8012836970474968) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8333333333) 
+ self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8333333333) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.90422229 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8073313411 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_d.py b/tests/distance/test_distance_unknown_d.py new file mode 100644 index 000000000..82aaa4e9e --- /dev/null +++ b/tests/distance/test_distance_unknown_d.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_d. + +This module contains unit tests for abydos.distance.UnknownD +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownD + + +class UnknownDTestCases(unittest.TestCase): + """Test UnknownD functions. 
+ + abydos.distance.UnknownD + """ + + cmp = UnknownD() + cmp_no_d = UnknownD(alphabet=0) + + def test_unknown_d_sim(self): + """Test abydos.distance.UnknownD.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.006377551020408163) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.016934801 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.7692307692 + ) + + def test_unknown_d_dist(self): + """Test abydos.distance.UnknownD.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9936224489795918) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.983065199 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.5) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.2307692308 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_e.py 
b/tests/distance/test_distance_unknown_e.py new file mode 100644 index 000000000..df4a2386b --- /dev/null +++ b/tests/distance/test_distance_unknown_e.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_e. + +This module contains unit tests for abydos.distance.UnknownE +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownE + + +class UnknownETestCases(unittest.TestCase): + """Test UnknownE functions. + + abydos.distance.UnknownE + """ + + cmp = UnknownE() + cmp_no_d = UnknownE(alphabet=0) + + def test_unknown_e_sim(self): + """Test abydos.distance.UnknownE.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_unknown_e_dist(self): + """Test abydos.distance.UnknownE.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 
'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_unknown_e_corr(self): + """Test abydos.distance.UnknownE.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_f.py b/tests/distance/test_distance_unknown_f.py new file mode 100644 index 000000000..f870f42b6 --- /dev/null +++ b/tests/distance/test_distance_unknown_f.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_f. 
+ +This module contains unit tests for abydos.distance.UnknownF +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownF + + +class UnknownFTestCases(unittest.TestCase): + """Test UnknownF functions. + + abydos.distance.UnknownF + """ + + cmp = UnknownF() + cmp_no_d = UnknownF(alphabet=0) + + def test_unknown_f_sim_score(self): + """Test abydos.distance.UnknownF.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 1.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.3068528194 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.3068528194 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.3068528194 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.3068528194 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.5956699662 + ) + + # Exceptions + self.assertRaises(NotImplementedError, self.cmp.sim, 'a', 'a') + self.assertRaises(NotImplementedError, self.cmp.dist, 'a', 'a') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_g.py b/tests/distance/test_distance_unknown_g.py new file mode 100644 index 000000000..c07aedd88 --- /dev/null +++ b/tests/distance/test_distance_unknown_g.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_g. + +This module contains unit tests for abydos.distance.UnknownG +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownG + + +class UnknownGTestCases(unittest.TestCase): + """Test UnknownG functions. 
+ + abydos.distance.UnknownG + """ + + cmp = UnknownG() + + def test_unknown_g_sim(self): + """Test abydos.distance.UnknownG.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6681818182 + ) + + def test_unknown_g_dist(self): + """Test abydos.distance.UnknownG.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3318181818 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_h.py b/tests/distance/test_distance_unknown_h.py new file mode 100644 index 000000000..feb37d5d5 --- /dev/null +++ b/tests/distance/test_distance_unknown_h.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_h. + +This module contains unit tests for abydos.distance.UnknownH +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownH + + +class UnknownHTestCases(unittest.TestCase): + """Test UnknownH functions. 
+ + abydos.distance.UnknownH + """ + + cmp = UnknownH() + cmp_no_d = UnknownH(alphabet=0) + + def test_unknown_h_sim(self): + """Test abydos.distance.UnknownH.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.75) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2958758548) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2958758548) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5093099295 + ) + + def test_unknown_h_dist(self): + """Test abydos.distance.UnknownH.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.25) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7041241452) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7041241452) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4906900705 + ) + + def test_unknown_h_sim_score(self): + """Test abydos.distance.UnknownH.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 0.75) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), -0.22360679774997896 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.2958758548 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.5093099295 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_i.py b/tests/distance/test_distance_unknown_i.py new file mode 100644 index 000000000..3f82096df --- /dev/null +++ b/tests/distance/test_distance_unknown_i.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_i. + +This module contains unit tests for abydos.distance.UnknownI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownI + + +class UnknownITestCases(unittest.TestCase): + """Test UnknownI functions. + + abydos.distance.UnknownI + """ + + cmp = UnknownI() + + def test_unknown_i_sim(self): + """Test abydos.distance.UnknownI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.2) + self.assertEqual(self.cmp.sim('', 'a'), 0.2) + self.assertEqual(self.cmp.sim('abc', ''), 0.125) + self.assertEqual(self.cmp.sim('', 'abc'), 0.125) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8333333333333334) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.023809523809523808) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.125) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.125) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.125) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.125) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.1428571429 + ) + + def test_unknown_i_dist(self): + """Test abydos.distance.UnknownI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.8) + self.assertEqual(self.cmp.dist('', 'a'), 0.8) + self.assertEqual(self.cmp.dist('abc', ''), 0.875) + self.assertEqual(self.cmp.dist('', 'abc'), 0.875) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.16666666666666663) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.9761904761904762) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.875) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.875) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.875) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.875) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.8571428571 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_j.py b/tests/distance/test_distance_unknown_j.py new file mode 100644 index 000000000..cb33dc14d --- /dev/null +++ b/tests/distance/test_distance_unknown_j.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_j. + +This module contains unit tests for abydos.distance.UnknownJ +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownJ + + +class UnknownJTestCases(unittest.TestCase): + """Test UnknownJ functions. 
+ + abydos.distance.UnknownJ + """ + + cmp = UnknownJ() + cmp_no_d = UnknownJ(alphabet=0) + + def test_unknown_j_sim(self): + """Test abydos.distance.UnknownJ.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.1785714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.1785714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.1785714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.1785714286 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.1988636364 + ) + + def test_unknown_j_dist(self): + """Test abydos.distance.UnknownJ.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.8214285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.8214285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.8214285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.8214285714 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.8011363636 + ) + + def test_unknown_j_sim_score(self): + """Test abydos.distance.UnknownJ.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + 
self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 1.005128205128205) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 0.5038560411 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 0.5038560411 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 0.5038560411 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 0.5038560411 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.6454192638 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 5.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 1.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 1.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 1.25) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 1.25) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 2.3863636364 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_k.py b/tests/distance/test_distance_unknown_k.py new file mode 100644 index 000000000..dd0947627 --- /dev/null +++ b/tests/distance/test_distance_unknown_k.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_k. + +This module contains unit tests for abydos.distance.UnknownK +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownK + + +class UnknownKTestCases(unittest.TestCase): + """Test UnknownK functions. 
+ + abydos.distance.UnknownK + """ + + cmp = UnknownK() + cmp_no_d = UnknownK(alphabet=0) + + def test_unknown_k_dist(self): + """Test abydos.distance.UnknownK.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.9948979591836735) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.9961734694) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.9961734694) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.9910714286 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_unknown_k_sim(self): + """Test abydos.distance.UnknownK.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.005102040816326481) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.0038265306) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.0038265306) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.0089285714 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5 + ) + + def test_unknown_k_dist_abs(self): + """Test abydos.distance.UnknownK.dist_abs.""" + # Base cases + 
self.assertEqual(self.cmp.dist_abs('', ''), 784.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 784.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 784.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 784.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 784.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 780.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 784.0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 781.0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 781.0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 781.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 781.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 777.0 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('a', ''), 2.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'a'), 2.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', ''), 4.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'abc'), 4.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('abcd', 'efgh'), 10.0) + + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Nigel', 'Niall'), 6.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Niall', 'Nigel'), 6.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Colin', 'Coiln'), 6.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Coiln', 'Colin'), 6.0) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('ATCAACGAGT', 'AACGATTAG'), 7.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_l.py b/tests/distance/test_distance_unknown_l.py new file mode 100644 index 000000000..a5927c5f7 --- /dev/null +++ b/tests/distance/test_distance_unknown_l.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_l. + +This module contains unit tests for abydos.distance.UnknownL +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownL + + +class UnknownLTestCases(unittest.TestCase): + """Test UnknownL functions. 
+ + abydos.distance.UnknownL + """ + + cmp = UnknownL() + cmp_no_d = UnknownL(alphabet=0) + + def test_unknown_l_sim(self): + """Test abydos.distance.UnknownL.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.9872448979591837) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9923469388) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9923469388) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9923371648 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual( + self.cmp_no_d.sim('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.5384615385 + ) + + def test_unknown_l_dist(self): + """Test abydos.distance.UnknownL.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.012755102040816313) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0076530612) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0076530612) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0076628352 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual( + self.cmp_no_d.dist('Nigel', 'Niall'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Niall', 'Nigel'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Colin', 'Coiln'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('Coiln', 'Colin'), 0.6666666667 + ) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.4615384615 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_unknown_m.py 
b/tests/distance/test_distance_unknown_m.py new file mode 100644 index 000000000..659a19a32 --- /dev/null +++ b/tests/distance/test_distance_unknown_m.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_unknown_m. + +This module contains unit tests for abydos.distance.UnknownM +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import UnknownM + + +class UnknownMTestCases(unittest.TestCase): + """Test UnknownM functions. + + abydos.distance.UnknownM + """ + + cmp = UnknownM() + cmp_no_d = UnknownM(alphabet=0) + + def test_unknown_m_sim(self): + """Test abydos.distance.UnknownM.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.0) + self.assertEqual(self.cmp.sim('a', ''), 0.14599478380307845) + self.assertEqual(self.cmp.sim('', 'a'), 0.14599478380307845) + self.assertEqual(self.cmp.sim('abc', ''), 0.24935979408619846) + self.assertEqual(self.cmp.sim('', 'abc'), 0.24935979408619846) + self.assertEqual(self.cmp.sim('abc', 'abc'), 0.8743589743589744) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.3993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6650599829) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6650599829) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6650599829) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6650599829) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7838816809 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.3) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.25) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.25) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.3073313411 + ) + + def test_unknown_m_dist(self): + """Test abydos.distance.UnknownM.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 1.0) + self.assertEqual(self.cmp.dist('a', ''), 0.8540052161969216) + self.assertEqual(self.cmp.dist('', 'a'), 0.8540052161969216) + self.assertEqual(self.cmp.dist('abc', ''), 0.7506402059138015) + self.assertEqual(self.cmp.dist('', 'abc'), 0.7506402059138015) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.12564102564102564) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6006418485237484) + + 
self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3349400171) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3349400171) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3349400171) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3349400171) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2161183191 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 0.7) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 0.75) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 0.75) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 0.6926686589 + ) + + def test_unknown_m_sim_score(self): + """Test abydos.distance.UnknownM.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 1.0) + self.assertEqual(self.cmp.sim_score('a', ''), 0.7080104323938431) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.7080104323938431) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.5012804118276031) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.5012804118276031) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), -0.7487179487179487) + self.assertEqual( + self.cmp.sim_score('abcd', 'efgh'), 0.2012836970474968 + ) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), -0.3301199657 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), -0.3301199657 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), -0.3301199657 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), -0.3301199657 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), -0.5677633618 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), 0.4) + + self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 0.3853373178 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_upholt.py b/tests/distance/test_distance_upholt.py new file mode 100644 index 000000000..2d4e80a6a --- /dev/null +++ b/tests/distance/test_distance_upholt.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. 
+# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_upholt. + +This module contains unit tests for abydos.distance.Upholt +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Upholt + + +class UpholtTestCases(unittest.TestCase): + """Test Upholt functions. + + abydos.distance.Upholt + """ + + cmp = Upholt() + cmp_no_d = Upholt(alphabet=0) + + def test_upholt_sim(self): + """Test abydos.distance.Upholt.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.7807764064) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.7807764064) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7807764064) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7807764064) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.8685170918 + ) + + def test_upholt_dist(self): + """Test abydos.distance.Upholt.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.2192235936) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.2192235936) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2192235936) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2192235936) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.1314829082 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_warrens_i.py b/tests/distance/test_distance_warrens_i.py new file mode 100644 index 000000000..5eb647c3e --- /dev/null +++ b/tests/distance/test_distance_warrens_i.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_warrens_i. + +This module contains unit tests for abydos.distance.WarrensI +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import WarrensI + + +class WarrensITestCases(unittest.TestCase): + """Test WarrensI functions. + + abydos.distance.WarrensI + """ + + cmp = WarrensI() + + def test_warrens_i_sim(self): + """Test abydos.distance.WarrensI.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6666666667 + ) + + def test_warrens_i_dist(self): + """Test abydos.distance.WarrensI.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + def test_warrens_i_corr(self): + """Test abydos.distance.WarrensI.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'a'), -1.0) + self.assertEqual(self.cmp.corr('abc', ''), -1.0) + self.assertEqual(self.cmp.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.3333333333 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_warrens_ii.py b/tests/distance/test_distance_warrens_ii.py new file mode 100644 index 000000000..1322faff9 --- /dev/null +++ b/tests/distance/test_distance_warrens_ii.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_warrens_ii. + +This module contains unit tests for abydos.distance.WarrensII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import WarrensII + + +class WarrensIITestCases(unittest.TestCase): + """Test WarrensII functions. + + abydos.distance.WarrensII + """ + + cmp = WarrensII() + cmp_no_d = WarrensII(alphabet=0) + + def test_warrens_ii_sim(self): + """Test abydos.distance.WarrensII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918263) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918263) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429667) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429667) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9954751131 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_warrens_ii_dist(self): + """Test abydos.distance.WarrensII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081737387) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033292) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033292) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0045248869 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + 
self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_warrens_iii.py b/tests/distance/test_distance_warrens_iii.py new file mode 100644 index 000000000..f1613d715 --- /dev/null +++ b/tests/distance/test_distance_warrens_iii.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_warrens_iii. + +This module contains unit tests for abydos.distance.WarrensIII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import WarrensIII + + +class WarrensIIITestCases(unittest.TestCase): + """Test WarrensIII functions. 
+ + abydos.distance.WarrensIII + """ + + cmp = WarrensIII() + cmp_no_d = WarrensIII(alphabet=0) + cmp_2_1 = WarrensIII(alphabet=2, qval=1) + + def test_warrens_iii_sim(self): + """Test abydos.distance.WarrensIII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.9987228607918264) + self.assertEqual(self.cmp.sim('', 'a'), 0.9987228607918264) + self.assertEqual(self.cmp.sim('abc', ''), 0.9974424552429668) + self.assertEqual(self.cmp.sim('', 'abc'), 0.9974424552429668) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.993581514762516) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9954751131 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + self.assertEqual(self.cmp_2_1.sim('CG', 'GC'), 0.5) + + def test_warrens_iii_dist(self): + """Test abydos.distance.WarrensIII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0012771392081736277) + self.assertEqual(self.cmp.dist('', 'a'), 0.0012771392081736277) + self.assertEqual(self.cmp.dist('abc', ''), 0.002557544757033181) + self.assertEqual(self.cmp.dist('', 'abc'), 0.002557544757033181) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.006418485237484006) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0045248869 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + 
+ def test_warrens_iii_corr(self): + """Test abydos.distance.WarrensIII.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 1.0) + self.assertEqual(self.cmp.corr('a', ''), 0.9974457215836526) + self.assertEqual(self.cmp.corr('', 'a'), 0.9974457215836526) + self.assertEqual(self.cmp.corr('abc', ''), 0.9948849104859335) + self.assertEqual(self.cmp.corr('', 'abc'), 0.9948849104859335) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), 0.9871630295250321) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9922879177) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9909502262 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 1.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), -1.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), -1.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_warrens_iv.py b/tests/distance/test_distance_warrens_iv.py new file mode 100644 index 000000000..4628457a5 --- /dev/null +++ b/tests/distance/test_distance_warrens_iv.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_warrens_iv. + +This module contains unit tests for abydos.distance.WarrensIV +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import WarrensIV + + +class WarrensIVTestCases(unittest.TestCase): + """Test WarrensIV functions. 
+
+    abydos.distance.WarrensIV
+    """
+
+    cmp = WarrensIV()
+    cmp_no_d = WarrensIV(alphabet=0)
+
+    def test_warrens_iv_sim(self):
+        """Test abydos.distance.WarrensIV.sim."""
+        # Base cases
+        self.assertEqual(self.cmp.sim('', ''), 1.0)
+        self.assertEqual(self.cmp.sim('a', ''), 0.0)
+        self.assertEqual(self.cmp.sim('', 'a'), 0.0)
+        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
+        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
+        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
+        self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0)
+
+        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6658075601)
+        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6658075601)
+        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6658075601)
+        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6658075601)
+        self.assertAlmostEqual(
+            self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7985480944
+        )
+
+        # Tests with alphabet=0 (no d factor)
+        self.assertEqual(self.cmp_no_d.sim('', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0)
+        self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0)
+
+        self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0)
+        self.assertAlmostEqual(
+            self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0
+        )
+
+    def test_warrens_iv_dist(self):
+        """Test abydos.distance.WarrensIV.dist."""
+        # Base cases
+        self.assertEqual(self.cmp.dist('', ''), 0.0)
+        self.assertEqual(self.cmp.dist('a', ''), 1.0)
+        self.assertEqual(self.cmp.dist('', 'a'), 1.0)
+        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
+        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
+        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
+        self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0)
+
+        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3341924399)
+        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3341924399)
+        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3341924399)
+        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3341924399)
+        self.assertAlmostEqual(
+            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2014519056
+        )
+
+        # Tests with alphabet=0 (no d factor)
+        self.assertEqual(self.cmp_no_d.dist('', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0)
+
+        self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0)
+        self.assertAlmostEqual(
+            self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/distance/test_distance_warrens_v.py b/tests/distance/test_distance_warrens_v.py
new file mode 100644
index 000000000..b6f963cef
--- /dev/null
+++ b/tests/distance/test_distance_warrens_v.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see .
+
+"""abydos.tests.distance.test_distance_warrens_v.
+
+This module contains unit tests for abydos.distance.WarrensV
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+import unittest
+
+from abydos.distance import WarrensV
+
+
+class WarrensVTestCases(unittest.TestCase):
+    """Test WarrensV functions.
+
+    abydos.distance.WarrensV
+    """
+
+    cmp = WarrensV()
+    cmp_no_d = WarrensV(alphabet=0)
+
+    def test_warrens_v_sim(self):
+        """Test abydos.distance.WarrensV.sim."""
+        # Base cases
+        self.assertEqual(self.cmp.sim('', ''), 1.0)
+        self.assertEqual(self.cmp.sim('a', ''), 0.0)
+        self.assertEqual(self.cmp.sim('', 'a'), 0.0)
+        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
+        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
+        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
+        self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0)
+
+        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5)
+        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5)
+        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5)
+        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5)
+        self.assertAlmostEqual(
+            self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364
+        )
+
+        # Tests with alphabet=0 (no d factor)
+        self.assertEqual(self.cmp_no_d.sim('', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0)
+        self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0)
+
+        self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0)
+        self.assertAlmostEqual(
+            self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0
+        )
+
+    def test_warrens_v_dist(self):
+        """Test abydos.distance.WarrensV.dist."""
+        # Base cases
+        self.assertEqual(self.cmp.dist('', ''), 0.0)
+        self.assertEqual(self.cmp.dist('a', ''), 1.0)
+        self.assertEqual(self.cmp.dist('', 'a'), 1.0)
+        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
+        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
+        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
+        self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0)
+
+        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5)
+        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5)
+        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5)
+        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5)
+        self.assertAlmostEqual(
+            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636
+        )
+
+        # Tests with alphabet=0 (no d factor)
+        self.assertEqual(self.cmp_no_d.dist('', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0)
+        self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0)
+
+        self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0)
+        self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0)
+        self.assertAlmostEqual(
+            self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0
+        )
+
+    def test_warrens_v_sim_score(self):
+        """Test abydos.distance.WarrensV.sim_score."""
+        # Base cases
+        self.assertEqual(self.cmp.sim_score('', ''), 0.0)
+        self.assertEqual(self.cmp.sim_score('a', ''), 0.0)
+        self.assertEqual(self.cmp.sim_score('', 'a'), 0.0)
+        self.assertEqual(self.cmp.sim_score('abc', ''), 0.0)
+        self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0)
+        self.assertEqual(self.cmp.sim_score('abc', 'abc'), 195.0)
+        self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), -1.0)
+
+        self.assertAlmostEqual(
+            self.cmp.sim_score('Nigel', 'Niall'), 64.3333333333
+        )
+        self.assertAlmostEqual(
+            self.cmp.sim_score('Niall', 'Nigel'), 64.3333333333
+        )
+        self.assertAlmostEqual(
+            self.cmp.sim_score('Colin', 'Coiln'), 64.3333333333
+        )
+        self.assertAlmostEqual(
+            self.cmp.sim_score('Coiln', 'Colin'), 64.3333333333
+        )
+        self.assertAlmostEqual(
+            self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 48.8909090909
+        )
+
+        # Tests with alphabet=0 (no d factor)
+        self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 0.0)
+        self.assertEqual(self.cmp_no_d.sim_score('abcd', 'efgh'), -1.0)
+
+        self.assertAlmostEqual(self.cmp_no_d.sim_score('Nigel', 'Niall'), -1.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim_score('Niall', 'Nigel'), -1.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim_score('Colin', 'Coiln'), -1.0)
+        self.assertAlmostEqual(self.cmp_no_d.sim_score('Coiln', 'Colin'), -1.0)
+        self.assertAlmostEqual(
+            self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), -1.0
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/distance/test_distance_weighted_jaccard.py b/tests/distance/test_distance_weighted_jaccard.py
new file mode 100644
index 000000000..ea26497fd
--- /dev/null
+++ b/tests/distance/test_distance_weighted_jaccard.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see .
+ +"""abydos.tests.distance.test_distance_weighted_jaccard. + +This module contains unit tests for abydos.distance.WeightedJaccard +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import WeightedJaccard + + +class WeightedJaccardTestCases(unittest.TestCase): + """Test WeightedJaccard functions. + + abydos.distance.WeightedJaccard + """ + + cmp = WeightedJaccard() + + def test_weighted_jaccard_sim(self): + """Test abydos.distance.WeightedJaccard.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.75) + + def test_weighted_jaccard_dist(self): + """Test abydos.distance.WeightedJaccard.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.4) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.4) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.25) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_whittaker.py b/tests/distance/test_distance_whittaker.py new file mode 100644 index 000000000..15a37efdc --- /dev/null +++ b/tests/distance/test_distance_whittaker.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_whittaker. + +This module contains unit tests for abydos.distance.Whittaker +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import Whittaker + + +class WhittakerTestCases(unittest.TestCase): + """Test Whittaker functions. 
+ + abydos.distance.Whittaker + """ + + cmp = Whittaker() + + def test_whittaker_sim(self): + """Test abydos.distance.Whittaker.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.6363636364 + ) + + def test_whittaker_dist(self): + """Test abydos.distance.Whittaker.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.5) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.5) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.3636363636 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yates_chi_squared.py b/tests/distance/test_distance_yates_chi_squared.py new file mode 100644 index 000000000..d882aa1c5 --- /dev/null +++ b/tests/distance/test_distance_yates_chi_squared.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yates_chi_squared. + +This module contains unit tests for abydos.distance.YatesChiSquared +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YatesChiSquared + + +class YatesChiSquaredTestCases(unittest.TestCase): + """Test YatesChiSquared functions. 
+ + abydos.distance.YatesChiSquared + """ + + cmp = YatesChiSquared() + cmp_no_d = YatesChiSquared(alphabet=0) + cmp_4q1 = YatesChiSquared(qval=1, alphabet=6) + + def test_yates_chi_squared_sim(self): + """Test abydos.distance.YatesChiSquared.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.2024579068) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.2024579068) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.2024579068) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.2024579068) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.415132719 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_yates_chi_squared_dist(self): + """Test abydos.distance.YatesChiSquared.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.7975420932) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.7975420932) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.7975420932) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.7975420932) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.584867281 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_yates_chi_squared_sim_score(self): + """Test abydos.distance.YatesChiSquared.sim_score.""" + # Base cases + self.assertEqual(self.cmp.sim_score('', ''), 0.0) + 
self.assertEqual(self.cmp.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim_score('abc', 'abc'), 599.3708349769888) + self.assertEqual(self.cmp.sim_score('abcd', 'efgh'), 6.960385076156687) + + self.assertAlmostEqual( + self.cmp.sim_score('Nigel', 'Niall'), 133.1878178031 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Niall', 'Nigel'), 133.1878178031 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Colin', 'Coiln'), 133.1878178031 + ) + self.assertAlmostEqual( + self.cmp.sim_score('Coiln', 'Colin'), 133.1878178031 + ) + self.assertAlmostEqual( + self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 296.1470911771 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim_score('', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.sim_score('abc', 'abc'), 1.0) + self.assertEqual( + self.cmp_no_d.sim_score('abcd', 'efgh', signed=True), -6.4 + ) + + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Nigel', 'Niall', signed=True), -0.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Niall', 'Nigel', signed=True), -0.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Colin', 'Coiln', signed=True), -0.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('Coiln', 'Colin', signed=True), -0.5625 + ) + self.assertAlmostEqual( + self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG', signed=True), + -0.2651515152, + ) + + self.assertEqual(self.cmp_4q1.sim_score('tab', 'tac'), 0.0) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yjhhr.py b/tests/distance/test_distance_yjhhr.py new file mode 100644 index 000000000..759c8e719 --- /dev/null +++ b/tests/distance/test_distance_yjhhr.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yjhhr. + +This module contains unit tests for abydos.distance.YJHHR +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YJHHR + + +class YJHHRTestCases(unittest.TestCase): + """Test YJHHR functions. 
+ + abydos.distance.YJHHR + """ + + cmp = YJHHR() + cmp_p3 = YJHHR(pval=3) + + def test_yjhhr_dist(self): + """Test abydos.distance.YJHHR.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.6666666666) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.6666666666) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.6666666666) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.6666666666) + self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.5) + + # Base cases + self.assertEqual(self.cmp_p3.dist('', ''), 0.0) + self.assertEqual(self.cmp_p3.dist('a', ''), 1.0) + self.assertEqual(self.cmp_p3.dist('', 'a'), 1.0) + self.assertEqual(self.cmp_p3.dist('abc', ''), 1.0) + self.assertEqual(self.cmp_p3.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp_p3.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_p3.dist('abcd', 'efgh'), 0.6299605249474369) + + self.assertAlmostEqual( + self.cmp_p3.dist('Nigel', 'Niall'), 0.4199736833 + ) + self.assertAlmostEqual( + self.cmp_p3.dist('Niall', 'Nigel'), 0.4199736833 + ) + self.assertAlmostEqual( + self.cmp_p3.dist('Colin', 'Coiln'), 0.4199736833 + ) + self.assertAlmostEqual( + self.cmp_p3.dist('Coiln', 'Colin'), 0.4199736833 + ) + self.assertAlmostEqual( + self.cmp_p3.dist('ATCAACGAGT', 'AACGATTAG'), 0.32128153180538643 + ) + + def test_yjhhr_sim(self): + """Test abydos.distance.YJHHR.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5) + + # Base cases + self.assertEqual(self.cmp_p3.sim('', ''), 1.0) + self.assertEqual(self.cmp_p3.sim('a', ''), 0.0) + self.assertEqual(self.cmp_p3.sim('', 'a'), 0.0) + self.assertEqual(self.cmp_p3.sim('abc', ''), 0.0) + self.assertEqual(self.cmp_p3.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp_p3.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_p3.sim('abcd', 'efgh'), 0.37003947505256307) + + self.assertAlmostEqual(self.cmp_p3.sim('Nigel', 'Niall'), 0.5800263167) + self.assertAlmostEqual(self.cmp_p3.sim('Niall', 'Nigel'), 0.5800263167) + self.assertAlmostEqual(self.cmp_p3.sim('Colin', 'Coiln'), 0.5800263167) + self.assertAlmostEqual(self.cmp_p3.sim('Coiln', 'Colin'), 0.5800263167) + self.assertAlmostEqual( + self.cmp_p3.sim('ATCAACGAGT', 'AACGATTAG'), 0.6787184681946136 + ) + + def test_yjhhr_dist_abs(self): + """Test abydos.distance.YJHHR.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 2.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 2.0) + 
self.assertEqual(self.cmp.dist_abs('abc', ''), 4.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 4.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 10.0) + + self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 6.0) + self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 6.0) + self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 6.0) + self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 6.0) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 7.0 + ) + + # Base cases + self.assertEqual(self.cmp_p3.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp_p3.dist_abs('a', ''), 2.0) + self.assertEqual(self.cmp_p3.dist_abs('', 'a'), 2.0) + self.assertEqual(self.cmp_p3.dist_abs('abc', ''), 4.0) + self.assertEqual(self.cmp_p3.dist_abs('', 'abc'), 4.0) + self.assertEqual(self.cmp_p3.dist_abs('abc', 'abc'), 0.0) + self.assertEqual( + self.cmp_p3.dist_abs('abcd', 'efgh'), 6.29960524947437 + ) + + self.assertAlmostEqual( + self.cmp_p3.dist_abs('Nigel', 'Niall'), 3.77976314968462 + ) + self.assertAlmostEqual( + self.cmp_p3.dist_abs('Niall', 'Nigel'), 3.77976314968462 + ) + self.assertAlmostEqual( + self.cmp_p3.dist_abs('Colin', 'Coiln'), 3.77976314968462 + ) + self.assertAlmostEqual( + self.cmp_p3.dist_abs('Coiln', 'Colin'), 3.77976314968462 + ) + self.assertAlmostEqual( + self.cmp_p3.dist_abs('ATCAACGAGT', 'AACGATTAG'), 4.49794144527541 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yujian_bo.py b/tests/distance/test_distance_yujian_bo.py new file mode 100644 index 000000000..13629dd02 --- /dev/null +++ b/tests/distance/test_distance_yujian_bo.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yujian_bo. + +This module contains unit tests for abydos.distance.YujianBo +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YujianBo + + +class YujianBoTestCases(unittest.TestCase): + """Test YujianBo functions. 
+ + abydos.distance.YujianBo + """ + + cmp = YujianBo() + + def test_yujian_bo_dist(self): + """Test abydos.distance.YujianBo.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'a'), 1.0) + self.assertEqual(self.cmp.dist('abc', ''), 1.0) + self.assertEqual(self.cmp.dist('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 0.6666666666666666) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3333333333) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3333333333) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4166666667 + ) + + def test_yujian_bo_sim(self): + """Test abydos.distance.YujianBo.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'a'), 0.0) + self.assertEqual(self.cmp.sim('abc', ''), 0.0) + self.assertEqual(self.cmp.sim('', 'abc'), 0.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.33333333333333337) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6666666667) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6666666667) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5833333333 + ) + + def test_yujian_bo_dist_abs(self): + """Test abydos.distance.YujianBo.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', ''), 1.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 1.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 0.6666666666666666) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 0.3333333333 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.4166666667 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yule_q.py b/tests/distance/test_distance_yule_q.py new file mode 100644 index 000000000..2c1abbdaa --- /dev/null +++ b/tests/distance/test_distance_yule_q.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yule_q. + +This module contains unit tests for abydos.distance.YuleQ +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YuleQ + + +class YuleQTestCases(unittest.TestCase): + """Test YuleQ functions. + + abydos.distance.YuleQ + """ + + cmp = YuleQ() + cmp_no_d = YuleQ(alphabet=0) + + def test_yule_q_sim(self): + """Test abydos.distance.YuleQ.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9977786005 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_yule_q_dist(self): + """Test abydos.distance.YuleQ.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0022213995 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 
1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_yule_q_corr(self): + """Test abydos.distance.YuleQ.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.9922879177) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.9922879177) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.995557201 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yule_q_ii.py b/tests/distance/test_distance_yule_q_ii.py new file mode 100644 index 000000000..068ed2eb9 --- /dev/null +++ b/tests/distance/test_distance_yule_q_ii.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yule_q_ii. + +This module contains unit tests for abydos.distance.YuleQII +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YuleQII + + +class YuleQIITestCases(unittest.TestCase): + """Test YuleQII functions. 
+ + abydos.distance.YuleQII + """ + + cmp = YuleQII() + cmp_no_d = YuleQII(alphabet=0) + + def test_yule_q_ii_dist(self): + """Test abydos.distance.YuleQII.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.0) + self.assertEqual(self.cmp.dist('a', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'a'), 0.0) + self.assertEqual(self.cmp.dist('abc', ''), 0.0) + self.assertEqual(self.cmp.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0038560411) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0038560411) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0022213995 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_yule_q_ii_sim(self): + """Test abydos.distance.YuleQII.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 1.0) + self.assertEqual(self.cmp.sim('a', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'a'), 1.0) + self.assertEqual(self.cmp.sim('abc', ''), 1.0) + self.assertEqual(self.cmp.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9961439589) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9961439589) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9977786005 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('a', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 1.0) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_yule_q_ii_dist_abs(self): + """Test abydos.distance.YuleQII.dist_abs.""" + # Base cases + self.assertEqual(self.cmp.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('a', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('', 'a'), 0.0) + 
self.assertEqual(self.cmp.dist_abs('abc', ''), 0.0) + self.assertEqual(self.cmp.dist_abs('', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 2.0) + + self.assertAlmostEqual( + self.cmp.dist_abs('Nigel', 'Niall'), 0.0077120823 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Niall', 'Nigel'), 0.0077120823 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Colin', 'Coiln'), 0.0077120823 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('Coiln', 'Colin'), 0.0077120823 + ) + self.assertAlmostEqual( + self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 0.004442799 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist_abs('', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.dist_abs('abcd', 'efgh'), 2.0) + + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Nigel', 'Niall'), 2.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Niall', 'Nigel'), 2.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Colin', 'Coiln'), 2.0) + self.assertAlmostEqual(self.cmp_no_d.dist_abs('Coiln', 'Colin'), 2.0) + self.assertAlmostEqual( + self.cmp_no_d.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/distance/test_distance_yule_y.py b/tests/distance/test_distance_yule_y.py new file mode 100644 index 000000000..f7d5a200c --- /dev/null +++ b/tests/distance/test_distance_yule_y.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.distance.test_distance_yule_y. + +This module contains unit tests for abydos.distance.YuleY +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.distance import YuleY + + +class YuleYTestCases(unittest.TestCase): + """Test YuleY functions. 
+ + abydos.distance.YuleY + """ + + cmp = YuleY() + cmp_no_d = YuleY(alphabet=0) + + def test_yule_y_sim(self): + """Test abydos.distance.YuleY.sim.""" + # Base cases + self.assertEqual(self.cmp.sim('', ''), 0.5) + self.assertEqual(self.cmp.sim('a', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'a'), 0.5) + self.assertEqual(self.cmp.sim('abc', ''), 0.5) + self.assertEqual(self.cmp.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.9414271324) + self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.9414271324) + self.assertAlmostEqual( + self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.9549418688 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.sim('', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.sim('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.sim('abcd', 'efgh'), 0.0) + + self.assertAlmostEqual(self.cmp_no_d.sim('Nigel', 'Niall'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Niall', 'Nigel'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Colin', 'Coiln'), 0.0) + self.assertAlmostEqual(self.cmp_no_d.sim('Coiln', 'Colin'), 0.0) + self.assertAlmostEqual( + self.cmp_no_d.sim('ATCAACGAGT', 'AACGATTAG'), 0.0 + ) + + def test_yule_y_dist(self): + """Test abydos.distance.YuleY.dist.""" + # Base cases + self.assertEqual(self.cmp.dist('', ''), 0.5) + self.assertEqual(self.cmp.dist('a', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'a'), 0.5) + self.assertEqual(self.cmp.dist('abc', ''), 0.5) + self.assertEqual(self.cmp.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0) + self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.0585728676) + self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.0585728676) + self.assertAlmostEqual( + self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.0450581312 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.dist('', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('a', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'a'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', ''), 0.5) + self.assertEqual(self.cmp_no_d.dist('', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abc', 'abc'), 0.5) + self.assertEqual(self.cmp_no_d.dist('abcd', 'efgh'), 1.0) + + self.assertAlmostEqual(self.cmp_no_d.dist('Nigel', 'Niall'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Niall', 'Nigel'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Colin', 'Coiln'), 1.0) + self.assertAlmostEqual(self.cmp_no_d.dist('Coiln', 'Colin'), 1.0) + self.assertAlmostEqual( + self.cmp_no_d.dist('ATCAACGAGT', 'AACGATTAG'), 1.0 + ) + + def test_yule_y_corr(self): + """Test abydos.distance.YuleY.corr.""" + # Base cases + self.assertEqual(self.cmp.corr('', ''), 0.0) + self.assertEqual(self.cmp.corr('a', ''), 0.0) + self.assertEqual(self.cmp.corr('', 'a'), 0.0) + self.assertEqual(self.cmp.corr('abc', ''), 0.0) + 
self.assertEqual(self.cmp.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp.corr('abc', 'abc'), 1.0) + self.assertEqual(self.cmp.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp.corr('Nigel', 'Niall'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Niall', 'Nigel'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Colin', 'Coiln'), 0.8828542648) + self.assertAlmostEqual(self.cmp.corr('Coiln', 'Colin'), 0.8828542648) + self.assertAlmostEqual( + self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9098837375 + ) + + # Tests with alphabet=0 (no d factor) + self.assertEqual(self.cmp_no_d.corr('', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('a', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'a'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', ''), 0.0) + self.assertEqual(self.cmp_no_d.corr('', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abc', 'abc'), 0.0) + self.assertEqual(self.cmp_no_d.corr('abcd', 'efgh'), -1.0) + + self.assertAlmostEqual(self.cmp_no_d.corr('Nigel', 'Niall'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Niall', 'Nigel'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Colin', 'Coiln'), -1.0) + self.assertAlmostEqual(self.cmp_no_d.corr('Coiln', 'Colin'), -1.0) + self.assertAlmostEqual( + self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/fingerprint/test_fingerprint_count.py b/tests/fingerprint/test_fingerprint_count.py index a96f0123b..7905f08f5 100644 --- a/tests/fingerprint/test_fingerprint_count.py +++ b/tests/fingerprint/test_fingerprint_count.py @@ -50,15 +50,13 @@ def test_count_fingerprint(self): self.assertEqual(self.fp.fingerprint('instance'), 0b0101010001100100) self.assertEqual(self.fp.fingerprint('inst'), 0b0001000001010100) + self.assertEqual(Count(15).fingerprint('instance'), 0b0101010001100100) self.assertEqual( - self.fp.fingerprint('instance', 15), 0b0101010001100100 - ) - self.assertEqual( - self.fp.fingerprint('instance', 32), + Count(32).fingerprint('instance'), 0b01010100011001000000000100000000, ) self.assertEqual( - self.fp.fingerprint('instance', 64), + Count(64).fingerprint('instance'), 0b01010100011001000000000100000000 << 32, ) diff --git a/tests/fingerprint/test_fingerprint_occurrence.py b/tests/fingerprint/test_fingerprint_occurrence.py index 75187e5cb..9f223c4f3 100644 --- a/tests/fingerprint/test_fingerprint_occurrence.py +++ b/tests/fingerprint/test_fingerprint_occurrence.py @@ -51,14 +51,14 @@ def test_occurrence_fingerprint(self): self.assertEqual(self.fp.fingerprint('inst'), 0b0100111000000000) self.assertEqual( - self.fp.fingerprint('instance', 15), 0b111011100001000 + Occurrence(15).fingerprint('instance'), 0b111011100001000 ) self.assertEqual( - self.fp.fingerprint('instance', 32), + Occurrence(32).fingerprint('instance'), 0b11101110000100000000000000000000, ) self.assertEqual( - self.fp.fingerprint('instance', 64), + Occurrence(64).fingerprint('instance'), 0b11101110000100000000000000000000 << 32, ) diff --git a/tests/fingerprint/test_fingerprint_occurrence_halved.py b/tests/fingerprint/test_fingerprint_occurrence_halved.py index 2645a94e1..3c2240389 100644 --- a/tests/fingerprint/test_fingerprint_occurrence_halved.py +++ b/tests/fingerprint/test_fingerprint_occurrence_halved.py @@ -51,14 +51,14 @@ def test_occurrence_halved_fingerprint(self): self.assertEqual(self.fp.fingerprint('inst'), 0b0001000010100100) self.assertEqual( - self.fp.fingerprint('instance', 15), 0b0110010010111000 + 
OccurrenceHalved(15).fingerprint('instance'), 0b0110010010111000 ) self.assertEqual( - self.fp.fingerprint('instance', 32), + OccurrenceHalved(32).fingerprint('instance'), 0b01100100101110000000000100000000, ) self.assertEqual( - self.fp.fingerprint('instance', 64), + OccurrenceHalved(64).fingerprint('instance'), 0b01100100101110000000000100000000 << 32, ) diff --git a/tests/fingerprint/test_fingerprint_phonetic.py b/tests/fingerprint/test_fingerprint_phonetic.py index 09a169238..332219c0e 100644 --- a/tests/fingerprint/test_fingerprint_phonetic.py +++ b/tests/fingerprint/test_fingerprint_phonetic.py @@ -44,7 +44,8 @@ class PhoneticTestCases(unittest.TestCase): """ fp = Phonetic() - phonet = Phonet() + fp_phonet = Phonetic(Phonet()) + fp_soundex = Phonetic(Soundex()) soundex = Soundex() def test_phonetic_fingerprint(self): @@ -56,12 +57,12 @@ def test_phonetic_fingerprint(self): self.fp.fingerprint(' '.join(NIALL)), 'a anl mknl njl nklk nl' ) self.assertEqual( - self.fp.fingerprint(' '.join(NIALL), self.phonet.encode), + self.fp_phonet.fingerprint(' '.join(NIALL)), 'knile makneil maknele neil nel nele nial nigeli ' + 'nigl nil noigialach oneil ui', ) self.assertEqual( - self.fp.fingerprint(' '.join(NIALL), self.soundex.encode), + self.fp_soundex.fingerprint(' '.join(NIALL)), 'k540 m254 n240 n242 n400 o540 u000', ) diff --git a/tests/fingerprint/test_fingerprint_position.py b/tests/fingerprint/test_fingerprint_position.py index f91a6f44d..050706092 100644 --- a/tests/fingerprint/test_fingerprint_position.py +++ b/tests/fingerprint/test_fingerprint_position.py @@ -51,14 +51,14 @@ def test_position_fingerprint(self): self.assertEqual(self.fp.fingerprint('instance'), 0b1110111001110001) self.assertEqual( - self.fp.fingerprint('instance', 15), 0b111011100111000 + Position(15).fingerprint('instance'), 0b111011100111000 ) self.assertEqual( - self.fp.fingerprint('instance', 32), + Position(32).fingerprint('instance'), 0b11101110011100000101011111111111, ) self.assertEqual( - self.fp.fingerprint('instance', 64), 0xEE7057FFEFFFFFFF + Position(64).fingerprint('instance'), 0xEE7057FFEFFFFFFF ) # Test wrapper diff --git a/tests/fingerprint/test_fingerprint_qgram.py b/tests/fingerprint/test_fingerprint_qgram.py index 39aeea85e..ad6c8d508 100644 --- a/tests/fingerprint/test_fingerprint_qgram.py +++ b/tests/fingerprint/test_fingerprint_qgram.py @@ -63,10 +63,10 @@ def test_qgram_fingerprint(self): for i in range(len(self._testset)): self.assertEqual( - self.fp.fingerprint(self._testset[i], 1), self._anssetq1[i] + QGram(1).fingerprint(self._testset[i]), self._anssetq1[i] ) self.assertEqual( - self.fp.fingerprint(self._testset[i], 2), self._anssetq2[i] + QGram(2).fingerprint(self._testset[i]), self._anssetq2[i] ) self.assertEqual( self.fp.fingerprint(self._testset[i]), self._anssetq2[i] diff --git a/tests/fingerprint/test_fingerprint_synoname.py b/tests/fingerprint/test_fingerprint_synoname_toolcode.py similarity index 100% rename from tests/fingerprint/test_fingerprint_synoname.py rename to tests/fingerprint/test_fingerprint_synoname_toolcode.py diff --git a/tests/fuzz/fuzz_test_fingerprint.py b/tests/fuzz/fuzz_test_fingerprint.py index cfe914401..12eae6186 100644 --- a/tests/fuzz/fuzz_test_fingerprint.py +++ b/tests/fuzz/fuzz_test_fingerprint.py @@ -48,31 +48,20 @@ from . 
import EXTREME_TEST, _corpus_file, _fuzz, _random_char -string = String() -qgram = QGram() -phonetic = Phonetic() -skeleton = SkeletonKey() -omission = OmissionKey() -occurrence = Occurrence() -occurrence_halved = OccurrenceHalved() -count = Count() -position = Position() synoname = SynonameToolcode() algorithms = { - 'str_fingerprint': string.fingerprint, - 'qgram_fingerprint': qgram.fingerprint, - 'qgram_fingerprint_3': lambda _: qgram.fingerprint(_, qval=3), - 'qgram_fingerprint_ssj': lambda _: qgram.fingerprint( - _, start_stop='$#', joiner=' ' - ), - 'phonetic_fingerprint': phonetic.fingerprint, - 'skeleton_key': skeleton.fingerprint, - 'omission_key': omission.fingerprint, - 'occurrence_fingerprint': occurrence.fingerprint, - 'occurrence_halved_fingerprint': occurrence_halved.fingerprint, - 'count_fingerprint': count.fingerprint, - 'position_fingerprint': position.fingerprint, + 'str_fingerprint': String().fingerprint, + 'qgram_fingerprint': QGram().fingerprint, + 'qgram_fingerprint_3': QGram(qval=3).fingerprint, + 'qgram_fingerprint_ssj': QGram(start_stop='$#', joiner=' ').fingerprint, + 'phonetic_fingerprint': Phonetic().fingerprint, + 'skeleton_key': SkeletonKey().fingerprint, + 'omission_key': OmissionKey().fingerprint, + 'occurrence_fingerprint': Occurrence().fingerprint, + 'occurrence_halved_fingerprint': OccurrenceHalved().fingerprint, + 'count_fingerprint': Count().fingerprint, + 'position_fingerprint': Position().fingerprint, 'synoname_toolcode': synoname.fingerprint, 'synoname_toolcode_2name': lambda _: synoname.fingerprint(_, _), } diff --git a/tests/fuzz/fuzz_test_phonetic.py b/tests/fuzz/fuzz_test_phonetic.py index 2892c9a96..2d47ba960 100644 --- a/tests/fuzz/fuzz_test_phonetic.py +++ b/tests/fuzz/fuzz_test_phonetic.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 by Christopher C. Little. +# Copyright 2018-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -46,7 +46,7 @@ Haase, HenryEarly, Koelner, - Lein, + LEIN, MRA, MetaSoundex, Metaphone, @@ -77,164 +77,101 @@ from . 
import EXTREME_TEST, _corpus_file, _fuzz, _random_char -alpha_sis = AlphaSIS() -bm = BeiderMorse() -caverphone = Caverphone() -davidson = Davidson() -dm = DaitchMokotoff() -dolby = Dolby() -double_metaphone = DoubleMetaphone() -eudex = Eudex() -fonem = FONEM() -fuzzy_soundex = FuzzySoundex() -haase = Haase() -henry_early = HenryEarly() -koelner = Koelner() -lein = Lein() -metaphone = Metaphone() -metasoundex = MetaSoundex() -mra = MRA() -norphone = Norphone() -nrl = NRL() -nysiis = NYSIIS() -onca = ONCA() -parmar_kumbharana = ParmarKumbharana() -phonem = Phonem() -phonet = Phonet() -phonetic_spanish = PhoneticSpanish() -phonex = Phonex() -phonix = Phonix() -pshp_soundex_first = PSHPSoundexFirst() -pshp_soundex_last = PSHPSoundexLast() -refined_soundex = RefinedSoundex() -reth_schek = RethSchek() -roger_root = RogerRoot() russell = RussellIndex() -sfinxbis = SfinxBis() -sound_d = SoundD() -soundex = Soundex() -soundex_br = SoundexBR() -spanish_metaphone = SpanishMetaphone() +koelner = Koelner() spfc = SPFC() -statistics_canada = StatisticsCanada() algorithms = { - 'russell_index': lambda _: str(russell.encode(_)), + 'russell_index': russell.encode, 'russell_index_num_to_alpha': lambda _: russell._to_alpha( # noqa: SF01 russell.encode(_) ), 'russell_index_alpha': russell.encode_alpha, - 'soundex': soundex.encode, - 'reverse_soundex': lambda _: soundex.encode(_, reverse=True), - 'soundex_0pad_ml6': lambda _: soundex.encode( - _, zero_pad=True, max_length=6 - ), - 'soundex_special': lambda _: soundex.encode(_, var='special'), - 'soundex_census': lambda _: ', '.join(soundex.encode(_, var='Census')), - 'refined_soundex': refined_soundex.encode, - 'refined_soundex_vowels': lambda _: refined_soundex.encode( - _, retain_vowels=True - ), - 'refined_soundex_0pad_ml6': lambda _: refined_soundex.encode( - _, zero_pad=True, max_length=6 - ), - 'dm_soundex': lambda _: ', '.join(sorted(dm.encode(_))), + 'soundex': Soundex().encode, + 'reverse_soundex': Soundex(reverse=True).encode, + 'soundex_0pad_ml6': Soundex(zero_pad=True, max_length=6).encode, + 'soundex_special': Soundex(var='special').encode, + 'soundex_census': Soundex(var='Census').encode, + 'refined_soundex': RefinedSoundex().encode, + 'refined_soundex_vowels': RefinedSoundex(retain_vowels=True).encode, + 'refined_soundex_0pad_ml6': RefinedSoundex( + zero_pad=True, max_length=6 + ).encode, + 'daitch_mokotoff_soundex': DaitchMokotoff().encode, 'koelner_phonetik': koelner.encode, 'koelner_phonetik_num_to_alpha': lambda _: koelner._to_alpha( # noqa: SF01 koelner.encode(_) ), 'koelner_phonetik_alpha': koelner.encode_alpha, - 'nysiis': nysiis.encode, - 'nysiis_modified': lambda _: nysiis.encode(_, modified=True), - 'nysiis_ml_inf': lambda _: nysiis.encode(_, max_length=-1), - 'mra': mra.encode, - 'metaphone': metaphone.encode, - 'double_metaphone': lambda _: ', '.join(double_metaphone.encode(_)), - 'caverphone_1': lambda _: caverphone.encode(_, version=1), - 'caverphone_2': caverphone.encode, - 'alpha_sis': lambda _: ', '.join(alpha_sis.encode(_)), - 'fuzzy_soundex': fuzzy_soundex.encode, - 'fuzzy_soundex_0pad_ml8': lambda _: fuzzy_soundex.encode( - _, max_length=8, zero_pad=True - ), - 'phonex': phonex.encode, - 'phonex_0pad_ml6': lambda _: phonex.encode(_, max_length=6, zero_pad=True), - 'phonem': phonem.encode, - 'phonix': phonix.encode, - 'phonix_0pad_ml6': lambda _: phonix.encode(_, max_length=6, zero_pad=True), - 'sfinxbis': lambda _: ', '.join(sfinxbis.encode(_)), - 'sfinxbis_ml6': lambda _: ', '.join(sfinxbis.encode(_, max_length=6)), - 
'phonet_1': phonet.encode,
-    'phonet_2': lambda _: phonet.encode(_, mode=2),
-    'phonet_1_none': lambda _: phonet.encode(_, lang='none'),
-    'phonet_2_none': lambda _: phonet.encode(_, mode=2, lang='none'),
+    'nysiis': NYSIIS().encode,
+    'nysiis_modified': NYSIIS(modified=True).encode,
+    'nysiis_ml_inf': NYSIIS(max_length=-1).encode,
+    'mra': MRA().encode,
+    'metaphone': Metaphone().encode,
+    'double_metaphone': DoubleMetaphone().encode,
+    'caverphone_1': Caverphone(version=1).encode,
+    'caverphone_2': Caverphone().encode,
+    'alpha_sis': AlphaSIS().encode,
+    'fuzzy_soundex': FuzzySoundex().encode,
+    'fuzzy_soundex_0pad_ml8': FuzzySoundex(max_length=8, zero_pad=True).encode,
+    'phonex': Phonex().encode,
+    'phonex_0pad_ml6': Phonex(max_length=6, zero_pad=True).encode,
+    'phonem': Phonem().encode,
+    'phonix': Phonix().encode,
+    'phonix_0pad_ml6': Phonix(max_length=6, zero_pad=True).encode,
+    'sfinxbis': SfinxBis().encode,
+    'sfinxbis_ml6': SfinxBis(max_length=6).encode,
+    'phonet_1': Phonet().encode,
+    'phonet_2': Phonet(mode=2).encode,
+    'phonet_1_none': Phonet(lang='none').encode,
+    'phonet_2_none': Phonet(mode=2, lang='none').encode,
     'spfc': lambda _: spfc.encode(_ + ' ' + _),
-    'statistics_canada': statistics_canada.encode,
-    'statistics_canada_ml8': lambda _: statistics_canada.encode(
-        _, max_length=8
-    ),
-    'lein': lein.encode,
-    'lein_nopad_ml8': lambda _: lein.encode(_, max_length=8, zero_pad=False),
-    'roger_root': roger_root.encode,
-    'roger_root_nopad_ml8': lambda _: roger_root.encode(
-        _, max_length=8, zero_pad=False
-    ),
-    'onca': onca.encode,
-    'onca_nopad_ml8': lambda _: onca.encode(_, max_length=8, zero_pad=False),
-    'eudex': lambda _: str(eudex.encode(_)),
-    'haase_phonetik': lambda _: ', '.join(haase.encode(_)),
-    'haase_phonetik_primary': lambda _: haase.encode(_, primary_only=True)[0],
-    'reth_schek_phonetik': reth_schek.encode,
-    'fonem': fonem.encode,
-    'parmar_kumbharana': parmar_kumbharana.encode,
-    'davidson': davidson.encode,
-    'sound_d': sound_d.encode,
-    'sound_d_ml8': lambda _: sound_d.encode(_, max_length=8),
-    'pshp_soundex_last': pshp_soundex_last.encode,
-    'pshp_soundex_last_german': lambda _: pshp_soundex_last.encode(
-        _, german=True
-    ),
-    'pshp_soundex_last_ml8': lambda _: pshp_soundex_last.encode(
-        _, max_length=8
-    ),
-    'pshp_soundex_first': pshp_soundex_first.encode,
-    'pshp_soundex_first_german': lambda _: pshp_soundex_first.encode(
-        _, german=True
-    ),
-    'pshp_soundex_first_ml8': lambda _: pshp_soundex_first.encode(
-        _, max_length=8
-    ),
-    'henry_early': henry_early.encode,
-    'henry_early_ml8': lambda _: henry_early.encode(_, max_length=8),
-    'norphone': norphone.encode,
-    'dolby': dolby.encode,
-    'dolby_ml4': lambda _: dolby.encode(_, max_length=4),
-    'dolby_vowels': lambda _: dolby.encode(_, keep_vowels=True),
-    'phonetic_spanish': phonetic_spanish.encode,
-    'phonetic_spanish_ml4': lambda _: phonetic_spanish.encode(_, max_length=4),
-    'spanish_metaphone': spanish_metaphone.encode,
-    'spanish_metaphone_modified': lambda _: spanish_metaphone.encode(
-        _, modified=True
-    ),
-    'spanish_metaphone_ml4': lambda _: spanish_metaphone.encode(
-        _, max_length=4
-    ),
-    'metasoundex': metasoundex.encode,
-    'metasoundex_es': lambda _: metasoundex.encode(_, lang='es'),
-    'soundex_br': soundex_br.encode,
-    'nrl': nrl.encode,
-    'bmpm': bm.encode,
-    'bmpm_german': lambda _: bm.encode(_, language_arg='german'),
-    'bmpm_french': lambda _: bm.encode(_, language_arg='french'),
-    'bmpm_gen_exact': lambda _: bm.encode(_, match_mode='exact'),
-    'bmpm_ash_approx': lambda _: bm.encode(_, name_mode='ash'),
-    'bmpm_ash_exact': lambda _: bm.encode(
-        _, name_mode='ash', match_mode='exact'
-    ),
-    'bmpm_sep_approx': lambda _: bm.encode(_, name_mode='sep'),
-    'bmpm_sep_exact': lambda _: bm.encode(
-        _, name_mode='sep', match_mode='exact'
-    ),
+    'statistics_canada': StatisticsCanada().encode,
+    'statistics_canada_ml8': StatisticsCanada(max_length=8).encode,
+    'lein': LEIN().encode,
+    'lein_nopad_ml8': LEIN(max_length=8, zero_pad=False).encode,
+    'roger_root': RogerRoot().encode,
+    'roger_root_nopad_ml8': RogerRoot(max_length=8, zero_pad=False).encode,
+    'onca': ONCA().encode,
+    'onca_nopad_ml8': ONCA(max_length=8, zero_pad=False).encode,
+    'eudex': Eudex().encode,
+    'haase_phonetik': Haase().encode,
+    'haase_phonetik_primary': Haase(primary_only=True).encode,
+    'reth_schek_phonetik': RethSchek().encode,
+    'fonem': FONEM().encode,
+    'parmar_kumbharana': ParmarKumbharana().encode,
+    'davidson': Davidson().encode,
+    'sound_d': SoundD().encode,
+    'sound_d_ml8': SoundD(max_length=8).encode,
+    'pshp_soundex_last': PSHPSoundexLast().encode,
+    'pshp_soundex_last_german': PSHPSoundexLast(german=True).encode,
+    'pshp_soundex_last_ml8': PSHPSoundexLast(max_length=8).encode,
+    'pshp_soundex_first': PSHPSoundexFirst().encode,
+    'pshp_soundex_first_german': PSHPSoundexFirst(german=True).encode,
+    'pshp_soundex_first_ml8': PSHPSoundexFirst(max_length=8).encode,
+    'henry_early': HenryEarly().encode,
+    'henry_early_ml8': HenryEarly(max_length=8).encode,
+    'norphone': Norphone().encode,
+    'dolby': Dolby().encode,
+    'dolby_ml4': Dolby(max_length=4).encode,
+    'dolby_vowels': Dolby(keep_vowels=True).encode,
+    'phonetic_spanish': PhoneticSpanish().encode,
+    'phonetic_spanish_ml4': PhoneticSpanish(max_length=4).encode,
+    'spanish_metaphone': SpanishMetaphone().encode,
+    'spanish_metaphone_modified': SpanishMetaphone(modified=True).encode,
+    'spanish_metaphone_ml4': SpanishMetaphone(max_length=4).encode,
+    'metasoundex': MetaSoundex().encode,
+    'metasoundex_es': MetaSoundex(lang='es').encode,
+    'soundex_br': SoundexBR().encode,
+    'nrl': NRL().encode,
+    'bmpm': BeiderMorse().encode,
+    'bmpm_german': BeiderMorse(language_arg='german').encode,
+    'bmpm_french': BeiderMorse(language_arg='french').encode,
+    'bmpm_gen_exact': BeiderMorse(match_mode='exact').encode,
+    'bmpm_ash_approx': BeiderMorse(name_mode='ash').encode,
+    'bmpm_ash_exact': BeiderMorse(name_mode='ash', match_mode='exact').encode,
+    'bmpm_sep_approx': BeiderMorse(name_mode='sep').encode,
+    'bmpm_sep_exact': BeiderMorse(name_mode='sep', match_mode='exact').encode,
 }
diff --git a/tests/phonetic/test_phonetic_alpha_sis.py b/tests/phonetic/test_phonetic_alpha_sis.py
index baa27939a..30f673033 100644
--- a/tests/phonetic/test_phonetic_alpha_sis.py
+++ b/tests/phonetic/test_phonetic_alpha_sis.py
@@ -63,11 +63,16 @@ def test_alpha_sis_encode(self):

         # max_length bounds tests
         self.assertEqual(
-            self.pa.encode('Niall', max_length=-1)[0],
-            '02500000000000000000000000000000000000000000000000'
-            + '00000000000000',
+            AlphaSIS(max_length=-1).encode('Niall')[0],
+            '0250000000000000000000000000000000000000000000000000000000000000',
         )
-        self.assertEqual(self.pa.encode('Niall', max_length=0)[0], '0250')
+        self.assertEqual(AlphaSIS(max_length=0).encode('Niall')[0], '0250')
+
+        # encode_alpha
+        self.assertEqual(self.pa.encode_alpha('Rogers')[0], 'RKR')
+        self.assertEqual(self.pa.encode_alpha('Kant')[0], 'KNT')
+        self.assertEqual(self.pa.encode_alpha('Knuth')[0], 'NT')
+        self.assertEqual(self.pa.encode_alpha('Harper')[0], 'HRPR')

         # Test wrapper
self.assertEqual(alpha_sis('Livingston')[0], '05827012000000') diff --git a/tests/phonetic/test_phonetic_beider_morse.py b/tests/phonetic/test_phonetic_beider_morse.py index 5f9ea1c56..e946695d1 100644 --- a/tests/phonetic/test_phonetic_beider_morse.py +++ b/tests/phonetic/test_phonetic_beider_morse.py @@ -82,7 +82,7 @@ def test_beider_morse_encode(self): particularly in terms of formatting and ordering. """ # base cases - self.assertEqual(self.pa.encode(''), '') + self.assertEqual(BeiderMorse().encode(''), '') for langs in ('', 1, 'spanish', 'english,italian', 3): for name_mode in ('gen', 'ash', 'sep'): @@ -94,8 +94,7 @@ def test_beider_morse_encode(self): ): self.assertRaises( ValueError, - self.pa.encode, - '', + BeiderMorse, langs, name_mode, match_mode, @@ -103,58 +102,62 @@ def test_beider_morse_encode(self): ) else: self.assertEqual( - self.pa.encode( - '', langs, name_mode, match_mode, concat - ), + BeiderMorse( + langs, name_mode, match_mode, concat + ).encode(''), '', ) # testSolrGENERIC # concat is true, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'gen', 'exact', True), + BeiderMorse('', 'gen', 'exact', True).encode('Angelo'), 'angelo anxelo anhelo anjelo anZelo andZelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'gen', 'exact', True), + BeiderMorse('', 'gen', 'exact', True).encode("D'Angelo"), 'angelo anxelo anhelo anjelo anZelo andZelo dangelo' + ' danxelo danhelo danjelo danZelo dandZelo', ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'gen', 'exact', True + BeiderMorse('italian,greek,spanish', 'gen', 'exact', True).encode( + 'Angelo' ), 'angelo anxelo andZelo', ) - self.assertEqual(self.pa.encode('1234', '', 'gen', 'exact', True), '') + self.assertEqual( + BeiderMorse('', 'gen', 'exact', True).encode('1234'), '' + ) # concat is false, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'gen', 'exact', False), + BeiderMorse('', 'gen', 'exact', False).encode('Angelo'), 'angelo anxelo anhelo anjelo anZelo andZelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'gen', 'exact', False), + BeiderMorse('', 'gen', 'exact', False).encode("D'Angelo"), 'angelo anxelo anhelo anjelo anZelo andZelo dangelo' + ' danxelo danhelo danjelo danZelo dandZelo', ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'gen', 'exact', False + BeiderMorse('italian,greek,spanish', 'gen', 'exact', False).encode( + 'Angelo' ), 'angelo anxelo andZelo', ) - self.assertEqual(self.pa.encode('1234', '', 'gen', 'exact', False), '') + self.assertEqual( + BeiderMorse('', 'gen', 'exact', False).encode('1234'), '' + ) # concat is true, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'gen', 'approx', True), + BeiderMorse('', 'gen', 'approx', True).encode('Angelo'), 'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' + ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' + ' anzilo onzilo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'gen', 'approx', True), + BeiderMorse('', 'gen', 'approx', True).encode("D'Angelo"), 'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' + ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' + ' anzilo onzilo dangilo dangYlo dagilo dongilo' @@ -162,22 +165,24 @@ def test_beider_morse_encode(self): + ' danilo donilo daniilo doniilo danzilo donzilo', ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'gen', 'approx', True + BeiderMorse('italian,greek,spanish', 'gen', 'approx', True).encode( + 'Angelo' ), 'angilo ongilo anxilo onxilo anzilo 
onzilo', ) - self.assertEqual(self.pa.encode('1234', '', 'gen', 'approx', True), '') + self.assertEqual( + BeiderMorse('', 'gen', 'approx', True).encode('1234'), '' + ) # concat is false, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'gen', 'approx', False), + BeiderMorse('', 'gen', 'approx', False).encode('Angelo'), 'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' + ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' + ' anzilo onzilo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'gen', 'approx', False), + BeiderMorse('', 'gen', 'approx', False).encode("D'Angelo"), 'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' + ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' + ' anzilo onzilo dangilo dangYlo dagilo dongilo' @@ -185,275 +190,279 @@ def test_beider_morse_encode(self): + ' danilo donilo daniilo doniilo danzilo donzilo', ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'gen', 'approx', False - ), + BeiderMorse( + 'italian,greek,spanish', 'gen', 'approx', False + ).encode('Angelo'), 'angilo ongilo anxilo onxilo anzilo onzilo', ) self.assertEqual( - self.pa.encode('1234', '', 'gen', 'approx', False), '' + BeiderMorse('', 'gen', 'approx', False).encode('1234'), '' ) # testSolrASHKENAZI # concat is true, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'ash', 'exact', True), + BeiderMorse('', 'ash', 'exact', True).encode('Angelo'), 'angelo andZelo anhelo anxelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'ash', 'exact', True), + BeiderMorse('', 'ash', 'exact', True).encode("D'Angelo"), 'dangelo dandZelo danhelo danxelo', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'ash', 'exact', True, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'ash', 'exact', True, True - ), + BeiderMorse( + 'italian,greek,spanish', 'ash', 'exact', True, True + ).encode('Angelo'), 'anxelo angelo', ) - self.assertEqual(self.pa.encode('1234', '', 'ash', 'exact', True), '') + self.assertEqual( + BeiderMorse('', 'ash', 'exact', True).encode('1234'), '' + ) # concat is false, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'ash', 'exact', False), + BeiderMorse('', 'ash', 'exact', False).encode('Angelo'), 'angelo andZelo anhelo anxelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'ash', 'exact', False), + BeiderMorse('', 'ash', 'exact', False).encode("D'Angelo"), 'dangelo dandZelo danhelo danxelo', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'ash', 'exact', False, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'ash', 'exact', False, True - ), + BeiderMorse( + 'italian,greek,spanish', 'ash', 'exact', False, True + ).encode('Angelo'), 'anxelo angelo', ) - self.assertEqual(self.pa.encode('1234', '', 'ash', 'exact', False), '') + self.assertEqual( + BeiderMorse('', 'ash', 'exact', False).encode('1234'), '' + ) # concat is true, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'ash', 'approx', True), + BeiderMorse('', 'ash', 'approx', True).encode('Angelo'), 'angilo angYlo ongilo ongYlo Yngilo YngYlo anzilo' + ' onzilo anilo onilo anxilo onxilo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'ash', 'approx', True), + BeiderMorse('', 'ash', 'approx', True).encode("D'Angelo"), 'dangilo dangYlo dongilo dongYlo dYngilo dYngYlo' + ' danzilo donzilo danilo donilo danxilo donxilo', ) self.assertRaises( ValueError, - 
self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'ash', 'approx', True, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'ash', 'approx', True, True - ), + BeiderMorse( + 'italian,greek,spanish', 'ash', 'approx', True, True + ).encode('Angelo'), 'anxYlo anxilo onxYlo onxilo angYlo angilo ongYlo' + ' ongilo', ) - self.assertEqual(self.pa.encode('1234', '', 'ash', 'approx', True), '') + self.assertEqual( + BeiderMorse('', 'ash', 'approx', True).encode('1234'), '' + ) # concat is false, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'ash', 'approx', False), + BeiderMorse('', 'ash', 'approx', False).encode('Angelo'), 'angilo angYlo ongilo ongYlo Yngilo YngYlo anzilo' + ' onzilo anilo onilo anxilo onxilo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'ash', 'approx', False), + BeiderMorse('', 'ash', 'approx', False).encode("D'Angelo"), 'dangilo dangYlo dongilo dongYlo dYngilo dYngYlo' + ' danzilo donzilo danilo donilo danxilo donxilo', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'ash', 'approx', False, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'ash', 'approx', False, True - ), + BeiderMorse( + 'italian,greek,spanish', 'ash', 'approx', False, True + ).encode('Angelo'), 'anxYlo anxilo onxYlo onxilo angYlo angilo ongYlo' + ' ongilo', ) self.assertEqual( - self.pa.encode('1234', '', 'ash', 'approx', False), '' + BeiderMorse('', 'ash', 'approx', False).encode('1234'), '' ) # testSolrSEPHARDIC # concat is true, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'sep', 'exact', True), + BeiderMorse('', 'sep', 'exact', True).encode('Angelo'), 'anZelo andZelo anxelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'sep', 'exact', True), + BeiderMorse('', 'sep', 'exact', True).encode("D'Angelo"), 'anZelo andZelo anxelo', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'sep', 'exact', True, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'sep', 'exact', True, True - ), + BeiderMorse( + 'italian,greek,spanish', 'sep', 'exact', True, True + ).encode('Angelo'), 'andZelo anxelo', ) - self.assertEqual(self.pa.encode('1234', '', 'sep', 'exact', True), '') + self.assertEqual( + BeiderMorse('', 'sep', 'exact', True).encode('1234'), '' + ) # concat is false, ruleType is EXACT self.assertEqual( - self.pa.encode('Angelo', '', 'sep', 'exact', False), + BeiderMorse('', 'sep', 'exact', False).encode('Angelo'), 'anZelo andZelo anxelo', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'sep', 'exact', False), + BeiderMorse('', 'sep', 'exact', False).encode("D'Angelo"), 'anZelo andZelo anxelo', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'sep', 'exact', False, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'sep', 'exact', False, True - ), + BeiderMorse( + 'italian,greek,spanish', 'sep', 'exact', False, True + ).encode('Angelo'), 'andZelo anxelo', ) - self.assertEqual(self.pa.encode('1234', '', 'sep', 'exact', False), '') + self.assertEqual( + BeiderMorse('', 'sep', 'exact', False).encode('1234'), '' + ) # concat is true, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'sep', 'approx', True), + BeiderMorse('', 'sep', 'approx', True).encode('Angelo'), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) self.assertEqual( - 
self.pa.encode('D\'Angelo', '', 'sep', 'approx', True), + BeiderMorse('', 'sep', 'approx', True).encode("D'Angelo"), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'sep', 'approx', True, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'sep', 'approx', True, True - ), + BeiderMorse( + 'italian,greek,spanish', 'sep', 'approx', True, True + ).encode('Angelo'), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) - self.assertEqual(self.pa.encode('1234', '', 'sep', 'approx', True), '') + self.assertEqual( + BeiderMorse('', 'sep', 'approx', True).encode('1234'), '' + ) # concat is false, ruleType is APPROX self.assertEqual( - self.pa.encode('Angelo', '', 'sep', 'approx', False), + BeiderMorse('', 'sep', 'approx', False).encode('Angelo'), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) self.assertEqual( - self.pa.encode('D\'Angelo', '', 'sep', 'approx', False), + BeiderMorse('', 'sep', 'approx', False).encode("D'Angelo"), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) self.assertRaises( ValueError, - self.pa.encode, - 'Angelo', + BeiderMorse, 'italian,greek,spanish', 'sep', 'approx', False, ) self.assertEqual( - self.pa.encode( - 'Angelo', 'italian,greek,spanish', 'sep', 'approx', False, True - ), + BeiderMorse( + 'italian,greek,spanish', 'sep', 'approx', False, True + ).encode('Angelo'), 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu', ) self.assertEqual( - self.pa.encode('1234', '', 'sep', 'approx', False), '' + BeiderMorse('', 'sep', 'approx', False).encode('1234'), '' ) # testCompatibilityWithOriginalVersion self.assertEqual( - self.pa.encode('abram', '', 'gen', 'approx', False), + BeiderMorse('', 'gen', 'approx', False).encode('abram'), 'abram abrom avram avrom obram obrom ovram ovrom' + ' Ybram Ybrom abran abron obran obron', ) self.assertEqual( - self.pa.encode('Bendzin', '', 'gen', 'approx', False), + BeiderMorse('', 'gen', 'approx', False).encode('Bendzin'), 'binzn bindzn vindzn bintsn vintsn', ) self.assertEqual( - self.pa.encode('abram', '', 'ash', 'approx', False), + BeiderMorse('', 'ash', 'approx', False).encode('abram'), 'abram abrom avram avrom obram obrom ovram ovrom' + ' Ybram Ybrom ombram ombrom imbram imbrom', ) self.assertEqual( - self.pa.encode('Halpern', '', 'ash', 'approx', False), + BeiderMorse('', 'ash', 'approx', False).encode('Halpern'), 'alpirn alpYrn olpirn olpYrn Ylpirn YlpYrn xalpirn' + ' xolpirn', ) # PhoneticEngineTest self.assertEqual( - self.pa.encode('Renault', '', 'gen', 'approx', True), + BeiderMorse('', 'gen', 'approx', True).encode('Renault'), 'rinolt rino rinDlt rinalt rinult rinD rina rinu', ) self.assertEqual( - self.pa.encode('Renault', '', 'ash', 'approx', True), + BeiderMorse('', 'ash', 'approx', True).encode('Renault'), 'rinDlt rinalt rinult rYnDlt rYnalt rYnult rinolt', ) self.assertEqual( - self.pa.encode('Renault', '', 'sep', 'approx', True), 'rinDlt' + BeiderMorse('', 'sep', 'approx', True).encode('Renault'), 'rinDlt' ) self.assertEqual( - self.pa.encode('SntJohn-Smith', '', 'gen', 'exact', True), + BeiderMorse('', 'gen', 'exact', True).encode('SntJohn-Smith'), 'sntjonsmit', ) self.assertEqual( - self.pa.encode('d\'ortley', '', 'gen', 'exact', True), + BeiderMorse('', 'gen', 'exact', True).encode("d'ortley"), 'ortlaj ortlej dortlaj dortlej', ) self.assertEqual( - self.pa.encode('van helsing', '', 'gen', 'exact', False), + BeiderMorse('', 'gen', 'exact', False).encode('van 
helsing'), 'helSink helsink helzink xelsink elSink elsink' + ' vanhelsink vanhelzink vanjelsink fanhelsink' + ' fanhelzink banhelsink', @@ -473,19 +482,19 @@ def test_beider_morse_encode_misc(self): """ # test of Ashkenazi with discardable prefix self.assertEqual( - self.pa.encode('bar Hayim', name_mode='ash'), 'Dm xDm' + BeiderMorse(name_mode='ash').encode('bar Hayim'), 'Dm xDm' ) # tests of concat behavior self.assertEqual( - self.pa.encode('Rodham Clinton', concat=False), + BeiderMorse(concat=False).encode('Rodham Clinton'), 'rodam rodom rYdam rYdom rodan rodon rodxam rodxom' + ' rodxan rodxon rudam rudom klinton klnton klintun' + ' klntun tzlinton tzlnton tzlintun tzlntun zlinton' + ' zlnton', ) self.assertEqual( - self.pa.encode('Rodham Clinton', concat=True), + BeiderMorse(concat=True).encode('Rodham Clinton'), 'rodamklinton rodomklinton rodamklnton rodomklnton' + ' rodamklintun rodomklintun rodamklntun rodomklntun' + ' rodamtzlinton rodomtzlinton rodamtzlnton' @@ -505,50 +514,50 @@ def test_beider_morse_encode_misc(self): # tests of name_mode values self.assertEqual( - self.pa.encode('bar Hayim', name_mode='ash'), 'Dm xDm' + BeiderMorse(name_mode='ash').encode('bar Hayim'), 'Dm xDm' ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='ashkenazi'), 'Dm xDm' + BeiderMorse(name_mode='ashkenazi').encode('bar Hayim'), 'Dm xDm' ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='Ashkenazi'), 'Dm xDm' + BeiderMorse(name_mode='Ashkenazi').encode('bar Hayim'), 'Dm xDm' ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='gen', concat=True), + BeiderMorse(name_mode='gen', concat=True).encode('bar Hayim'), 'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' + ' borxDm varxDm vorxDm barxDn borxDn', ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='general', concat=True), + BeiderMorse(name_mode='general', concat=True).encode('bar Hayim'), 'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' + ' borxDm varxDm vorxDm barxDn borxDn', ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='Mizrahi', concat=True), + BeiderMorse(name_mode='Mizrahi', concat=True).encode('bar Hayim'), 'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' + ' borxDm varxDm vorxDm barxDn borxDn', ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='mizrahi', concat=True), + BeiderMorse(name_mode='mizrahi', concat=True).encode('bar Hayim'), 'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' + ' borxDm varxDm vorxDm barxDn borxDn', ) self.assertEqual( - self.pa.encode('bar Hayim', name_mode='miz', concat=True), + BeiderMorse(name_mode='miz', concat=True).encode('bar Hayim'), 'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' + ' borxDm varxDm vorxDm barxDn borxDn', ) # test that out-of-range language_arg results in L_ANY self.assertEqual( - self.pa.encode('Rodham Clinton', language_arg=2 ** 32), + BeiderMorse(language_arg=2 ** 32).encode('Rodham Clinton'), 'rodam rodom rYdam rYdom rodan rodon rodxam rodxom' + ' rodxan rodxon rudam rudom klinton klnton klintun' + ' klntun tzlinton tzlnton tzlintun tzlntun zlinton' + ' zlnton', ) self.assertEqual( - self.pa.encode('Rodham Clinton', language_arg=-4), + BeiderMorse(language_arg=-4).encode('Rodham Clinton'), 'rodam rodom rYdam rYdom rodan rodon rodxam rodxom' + ' rodxan rodxon rudam rudom klinton klnton klintun' + ' klntun tzlinton tzlnton tzlintun tzlntun zlinton' @@ -557,7 +566,7 @@ def test_beider_morse_encode_misc(self): # etc. 
(for code coverage) self.assertEqual( - self.pa.encode('van Damme', name_mode='sep'), 'dami mi dam m' + BeiderMorse(name_mode='sep').encode('van Damme'), 'dami mi dam m' ) def test_beider_morse_encode_nachnamen(self): @@ -574,10 +583,12 @@ def test_beider_morse_encode_nachnamen(self): # so let's just randomly select about 20 for testing if nn_line[0] != '#' and _one_in(500): self.assertEqual( - self.pa.encode(nn_line[0], language_arg='german'), + BeiderMorse(language_arg='german').encode(nn_line[0]), nn_line[1], ) - self.assertEqual(self.pa.encode(nn_line[0]), nn_line[2]) + self.assertEqual( + BeiderMorse().encode(nn_line[0]), nn_line[2] + ) def test_beider_morse_encode_nachnamen_cc(self): """Test abydos.phonetic.BeiderMorse (Nachnamen, corner cases).""" @@ -591,10 +602,12 @@ def test_beider_morse_encode_nachnamen_cc(self): # so let's just randomly select about 20 for testing if nn_line[0] != '#': self.assertEqual( - self.pa.encode(nn_line[0], language_arg='german'), + BeiderMorse(language_arg='german').encode(nn_line[0]), nn_line[1], ) - self.assertEqual(self.pa.encode(nn_line[0]), nn_line[2]) + self.assertEqual( + BeiderMorse().encode(nn_line[0]), nn_line[2] + ) def test_beider_morse_encode_uscensus2000(self): """Test abydos.phonetic.BeiderMorse (US Census 2000 set).""" @@ -608,39 +621,39 @@ def test_beider_morse_encode_uscensus2000(self): # so let's just randomly select about 20 for testing if cen_line[0] != '#' and _one_in(7500): self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='gen' - ), + BeiderMorse( + match_mode='approx', name_mode='gen' + ).encode(cen_line[0]), cen_line[1], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='ash' - ), + BeiderMorse( + match_mode='approx', name_mode='ash' + ).encode(cen_line[0]), cen_line[2], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='sep' - ), + BeiderMorse( + match_mode='approx', name_mode='sep' + ).encode(cen_line[0]), cen_line[3], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='exact', name_mode='gen' - ), + BeiderMorse( + match_mode='exact', name_mode='gen' + ).encode(cen_line[0]), cen_line[4], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='exact', name_mode='ash' - ), + BeiderMorse( + match_mode='exact', name_mode='ash' + ).encode(cen_line[0]), cen_line[5], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='exact', name_mode='sep' - ), + BeiderMorse( + match_mode='exact', name_mode='sep' + ).encode(cen_line[0]), cen_line[6], ) @@ -654,39 +667,39 @@ def test_beider_morse_encode_uscensus2000_cc(self): # so let's just randomly select about 20 for testing if cen_line[0] != '#' and _one_in(10): self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='gen' - ), + BeiderMorse( + match_mode='approx', name_mode='gen' + ).encode(cen_line[0]), cen_line[1], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='ash' - ), + BeiderMorse( + match_mode='approx', name_mode='ash' + ).encode(cen_line[0]), cen_line[2], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='approx', name_mode='sep' - ), + BeiderMorse( + match_mode='approx', name_mode='sep' + ).encode(cen_line[0]), cen_line[3], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='exact', name_mode='gen' - ), + BeiderMorse( + match_mode='exact', name_mode='gen' + ).encode(cen_line[0]), cen_line[4], ) self.assertEqual( - self.pa.encode( - cen_line[0], 
match_mode='exact', name_mode='ash' - ), + BeiderMorse( + match_mode='exact', name_mode='ash' + ).encode(cen_line[0]), cen_line[5], ) self.assertEqual( - self.pa.encode( - cen_line[0], match_mode='exact', name_mode='sep' - ), + BeiderMorse( + match_mode='exact', name_mode='sep' + ).encode(cen_line[0]), cen_line[6], ) @@ -928,6 +941,10 @@ def test_beider_morse_normalize_lang_attrs(self): 'abc', ) + self.assertEqual( + self.pa._language_index_from_code(0, 'gen'), L_ANY # noqa: SF01 + ) + if __name__ == '__main__': unittest.main() diff --git a/tests/phonetic/test_phonetic_caverphone.py b/tests/phonetic/test_phonetic_caverphone.py index 761b61d57..a1aa064ca 100644 --- a/tests/phonetic/test_phonetic_caverphone.py +++ b/tests/phonetic/test_phonetic_caverphone.py @@ -42,12 +42,13 @@ class CaverphoneTestCases(unittest.TestCase): """ pa = Caverphone() + pa_1 = Caverphone(version=1) + pa_2 = Caverphone(version=2) def test_caverphone2_encode(self): """Test abydos.phonetic.Caverphone (Caverphone 2).""" self.assertEqual(self.pa.encode(''), '1111111111') - self.assertEqual(self.pa.encode('', 2), '1111111111') - self.assertEqual(self.pa.encode('', version=2), '1111111111') + self.assertEqual(self.pa_2.encode(''), '1111111111') # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html self.assertEqual(self.pa.encode('Henrichsen'), 'ANRKSN1111') @@ -139,8 +140,7 @@ def test_caverphone2_encode(self): 'Tutto', ): self.assertEqual(self.pa.encode(word), 'TTA1111111') - self.assertEqual(self.pa.encode(word, 2), 'TTA1111111') - self.assertEqual(self.pa.encode(word, version=2), 'TTA1111111') + self.assertEqual(self.pa_2.encode(word), 'TTA1111111') for word in ( 'Cailean', 'Calan', @@ -226,8 +226,7 @@ def test_caverphone2_encode(self): 'Xylon', ): self.assertEqual(self.pa.encode(word), 'KLN1111111') - self.assertEqual(self.pa.encode(word, 2), 'KLN1111111') - self.assertEqual(self.pa.encode(word, version=2), 'KLN1111111') + self.assertEqual(self.pa_2.encode(word), 'KLN1111111') for word in ( 'Dan', 'Dane', @@ -300,14 +299,20 @@ def test_caverphone2_encode(self): 'Tyne', ): self.assertEqual(self.pa.encode(word), 'TN11111111') - self.assertEqual(self.pa.encode(word, 2), 'TN11111111') - self.assertEqual(self.pa.encode(word, version=2), 'TN11111111') + self.assertEqual(self.pa_2.encode(word), 'TN11111111') # etc. 
(for code coverage) self.assertEqual(self.pa.encode('enough'), 'ANF1111111') self.assertEqual(self.pa.encode('trough'), 'TRF1111111') self.assertEqual(self.pa.encode('gnu'), 'NA11111111') + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Henrichsen'), 'ANRKSN') + self.assertEqual(self.pa.encode_alpha('Dierdre'), 'TTA') + self.assertEqual(self.pa.encode_alpha('Mcclifferty'), 'MKLFTA') + self.assertEqual(self.pa.encode_alpha('Killen'), 'KLN') + self.assertEqual(self.pa.encode_alpha('Whittle'), 'WTA') + # Test wrapper self.assertEqual(caverphone('Maclaverty'), 'MKLFTA1111') @@ -321,12 +326,11 @@ def test_caverphone2_encode_php_testset(self): def test_caverphone1_encode(self): """Test abydos.phonetic.Caverphone (Caverphone 1).""" - self.assertEqual(self.pa.encode('', 1), '111111') - self.assertEqual(self.pa.encode('', version=1), '111111') + self.assertEqual(self.pa_1.encode(''), '111111') # http://caversham.otago.ac.nz/files/working/ctp060902.pdf - self.assertEqual(self.pa.encode('David', version=1), 'TFT111') - self.assertEqual(self.pa.encode('Whittle', version=1), 'WTL111') + self.assertEqual(self.pa_1.encode('David'), 'TFT111') + self.assertEqual(self.pa_1.encode('Whittle'), 'WTL111') def test_caversham(self): """Test using Caversham test set (SoundEx, Metaphone, & Caverphone).""" diff --git a/tests/phonetic/test_phonetic_daitch_mokotoff.py b/tests/phonetic/test_phonetic_daitch_mokotoff.py index 9cbf44631..1aab1341e 100644 --- a/tests/phonetic/test_phonetic_daitch_mokotoff.py +++ b/tests/phonetic/test_phonetic_daitch_mokotoff.py @@ -121,25 +121,41 @@ def test_daitch_mokotoff(self): # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1), {'68' + '0' * 62} + DaitchMokotoff(max_length=-1).encode('Niall'), {'68' + '0' * 62} + ) + self.assertEqual( + DaitchMokotoff(max_length=0).encode('Niall'), {'680000'} ) - self.assertEqual(self.pa.encode('Niall', max_length=0), {'680000'}) # zero_pad tests self.assertEqual( - self.pa.encode('Niall', max_length=-1, zero_pad=False), {'68'} + DaitchMokotoff(max_length=-1, zero_pad=False).encode('Niall'), + {'68'}, + ) + self.assertEqual( + DaitchMokotoff(max_length=0, zero_pad=False).encode('Niall'), + {'68'}, ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), {'68'} + DaitchMokotoff(max_length=0, zero_pad=True).encode('Niall'), + {'680000'}, ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=True), {'680000'} + DaitchMokotoff(max_length=6, zero_pad=False).encode(''), {'0'} ) self.assertEqual( - self.pa.encode('', max_length=6, zero_pad=False), {'0'} + DaitchMokotoff(max_length=6, zero_pad=True).encode(''), {'000000'} + ) + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Augsburg'), {'AKSPRK'}) + self.assertEqual(self.pa.encode_alpha('Breuer'), {'PRAR'}) + self.assertEqual( + self.pa.encode_alpha('Halberstadt'), {'KLPRST', 'KLPSTT'} ) + self.assertEqual(self.pa.encode_alpha('Mannheim'), {'NNKN'}) self.assertEqual( - self.pa.encode('', max_length=6, zero_pad=True), {'000000'} + self.pa.encode_alpha('Chernowitz'), {'KRNPS', 'SRNPS'} ) # Test wrapper diff --git a/tests/phonetic/test_phonetic_davidson.py b/tests/phonetic/test_phonetic_davidson.py index 3f2e9b077..0314a7e87 100644 --- a/tests/phonetic/test_phonetic_davidson.py +++ b/tests/phonetic/test_phonetic_davidson.py @@ -39,13 +39,13 @@ class DavidsonTestCases(unittest.TestCase): test cases for abydos.phonetic.Davidson """ - pa = Davidson() + pa = Davidson(omit_fname=True) def test_davidson_encode(self): """Test 
abydos.phonetic.Davidson."""
         # Base cases
-        self.assertEqual(self.pa.encode('', omit_fname=True), ' ')
-        self.assertEqual(self.pa.encode(''), ' .')
+        self.assertEqual(self.pa.encode(''), ' ')
+        self.assertEqual(Davidson().encode(''), ' .')

         # Test cases from Gadd (1988) "'Fisching fore werds': phonetic
         # retrieval of written text in information systems." Program,
@@ -69,7 +69,7 @@
             ('REECE', 'RC '),
         )
         for word, encoding in test_cases:
-            self.assertEqual(self.pa.encode(word, omit_fname=True), encoding)
+            self.assertEqual(self.pa.encode(word), encoding)

         # Test wrapper
         self.assertEqual(davidson('WAIT', omit_fname=True), 'WT ')
diff --git a/tests/phonetic/test_phonetic_dolby.py b/tests/phonetic/test_phonetic_dolby.py
index 7d8997a30..588ae4f89 100644
--- a/tests/phonetic/test_phonetic_dolby.py
+++ b/tests/phonetic/test_phonetic_dolby.py
@@ -72,7 +72,7 @@ def test_dolby(self):
             ('*LVR', 'Oliveira', 'Olivera', 'Olivero'),
             ('*MS', 'Ames', 'Eames'),
             ('*NGL', 'Engel', 'Engle', 'Ingle'),
-            ('*NL', 'O\'Neal', 'O\'Neil', 'O\'Neill'),
+            ('*NL', "O'Neal", "O'Neil", "O'Neill"),
             ('*NRS', 'Andrews', 'Andrus'),
             ('*NRSN', 'Andersen', 'Anderson', 'Andreasen'),
             ('*NS', 'Ennis', 'Enos'),
@@ -739,18 +739,24 @@ def test_dolby(self):

         # Additional tests to improve coverage
         self.assertEqual(self.pa.encode('Rune'), 'R*N')
-        self.assertEqual(self.pa.encode('Rune', keep_vowels=True), 'R*N*')
-        self.assertEqual(self.pa.encode('Rune', vowel_char=''), 'RN')
-        self.assertEqual(self.pa.encode('Rune', vowel_char='A'), 'RAN')
-        self.assertEqual(self.pa.encode('Rune', max_length=2), 'R*')
-        self.assertEqual(self.pa.encode('Rune', max_length=2), 'R*')
-        self.assertEqual(self.pa.encode('Wassermann', max_length=4), 'W*SR')
+        self.assertEqual(Dolby(keep_vowels=True).encode('Rune'), 'R*N*')
+        self.assertEqual(Dolby(vowel_char='').encode('Rune'), 'RN')
+        self.assertEqual(Dolby(vowel_char='A').encode('Rune'), 'RAN')
+        self.assertEqual(Dolby(max_length=2).encode('Rune'), 'R*')
+        self.assertEqual(Dolby(max_length=2).encode('Rune'), 'R*')
+        self.assertEqual(Dolby(max_length=4).encode('Wassermann'), 'W*SR')
         self.assertEqual(
-            self.pa.encode('Wassermanns', max_length=4, keep_vowels=True),
-            'W*S*',
+            Dolby(max_length=4, keep_vowels=True).encode('Wassermanns'), 'W*S*'
         )
         self.assertEqual(self.pa.encode('Wassermanns'), 'W*SRMNS')

+        # encode_alpha
+        self.assertEqual(self.pa.encode_alpha('Rune'), 'RAN')
+        self.assertEqual(self.pa.encode_alpha('Weissman'), 'WASMN')
+        self.assertEqual(self.pa.encode_alpha('Pederson'), 'PADRSN')
+        self.assertEqual(self.pa.encode_alpha('Frederiksen'), 'FRADRKSN')
+        self.assertEqual(self.pa.encode_alpha('Bare'), 'BAR')
+
         # Test wrapper
         self.assertEqual(dolby('Wassermanns'), 'W*SRMNS')

diff --git a/tests/phonetic/test_phonetic_double_metaphone.py b/tests/phonetic/test_phonetic_double_metaphone.py
index a093335fc..dcf797c65 100644
--- a/tests/phonetic/test_phonetic_double_metaphone.py
+++ b/tests/phonetic/test_phonetic_double_metaphone.py
@@ -86,6 +86,7 @@ class DoubleMetaphoneTestCases(unittest.TestCase):
     """

     pa = DoubleMetaphone()
+    pa_4 = DoubleMetaphone(4)

     def test_double_metaphone(self):
         """Test abydos.phonetic.DoubleMetaphone."""
@@ -278,8 +279,20 @@ def test_double_metaphone(self):
         self.assertEqual(self.pa.encode('hej'), ('HJ', 'H'))

         # max_length bounds tests
-        self.assertEqual(self.pa.encode('Niall', max_length=-1), ('NL', ''))
-        self.assertEqual(self.pa.encode('Niall', max_length=0), ('NL', ''))
+        self.assertEqual(
+            DoubleMetaphone(max_length=-1).encode('Niall'), ('NL', '')
+        )
+ self.assertEqual( + DoubleMetaphone(max_length=0).encode('Niall'), ('NL', '') + ) + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('maurice'), ('MRS', '')) + self.assertEqual(self.pa.encode_alpha('auto'), ('AT', '')) + self.assertEqual(self.pa.encode_alpha('maisey'), ('MS', '')) + self.assertEqual(self.pa.encode_alpha('catherine'), ('KÞRN', 'KTRN')) + self.assertEqual(self.pa.encode_alpha('geoff'), ('JF', 'KF')) + self.assertEqual(self.pa.encode_alpha('Bosworth'), ('PSRÞ', 'PSRT')) # Test wrapper self.assertEqual(double_metaphone('cambrillo'), ('KMPRL', 'KMPR')) @@ -506,7 +519,7 @@ def test_double_metaphone_surnames(self): self.assertEqual(self.pa.encode('Curtis'), ('KRTS', '')) self.assertEqual(self.pa.encode('Cutha'), ('K0', 'KT')) self.assertEqual(self.pa.encode('Cutter'), ('KTR', '')) - self.assertEqual(self.pa.encode('D\'Aubigny'), ('TPN', 'TPKN')) + self.assertEqual(self.pa.encode("D'Aubigny"), ('TPN', 'TPKN')) self.assertEqual(self.pa.encode('DAVIS'), ('TFS', '')) self.assertEqual(self.pa.encode('Dabinott'), ('TPNT', '')) self.assertEqual(self.pa.encode('Dacre'), ('TKR', '')) @@ -1249,7 +1262,7 @@ def test_double_metaphone_surnames(self): self.assertEqual(self.pa.encode('ab Wennonwen'), ('APNNN', '')) self.assertEqual(self.pa.encode('ap Llewellyn'), ('APLLN', '')) self.assertEqual(self.pa.encode('ap Lorwerth'), ('APLRR0', 'APLRRT')) - self.assertEqual(self.pa.encode('d\'Angouleme'), ('TNKLM', '')) + self.assertEqual(self.pa.encode("d'Angouleme"), ('TNKLM', '')) self.assertEqual(self.pa.encode('de Audeham'), ('TTHM', '')) self.assertEqual(self.pa.encode('de Bavant'), ('TPFNT', '')) self.assertEqual(self.pa.encode('de Beauchamp'), ('TPXMP', 'TPKMP')) @@ -1521,1228 +1534,1226 @@ def test_double_metaphone_surnames(self): def test_double_metaphone_surnames4(self): """Test abydos.phonetic.DoubleMetaphone (surname data, 4-letter).""" - self.assertEqual(self.pa.encode('', 4), ('', '')) - self.assertEqual(self.pa.encode('ALLERTON', 4), ('ALRT', '')) - self.assertEqual(self.pa.encode('Acton', 4), ('AKTN', '')) - self.assertEqual(self.pa.encode('Adams', 4), ('ATMS', '')) - self.assertEqual(self.pa.encode('Aggar', 4), ('AKR', '')) - self.assertEqual(self.pa.encode('Ahl', 4), ('AL', '')) - self.assertEqual(self.pa.encode('Aiken', 4), ('AKN', '')) - self.assertEqual(self.pa.encode('Alan', 4), ('ALN', '')) - self.assertEqual(self.pa.encode('Alcock', 4), ('ALKK', '')) - self.assertEqual(self.pa.encode('Alden', 4), ('ALTN', '')) - self.assertEqual(self.pa.encode('Aldham', 4), ('ALTM', '')) - self.assertEqual(self.pa.encode('Allen', 4), ('ALN', '')) - self.assertEqual(self.pa.encode('Allerton', 4), ('ALRT', '')) - self.assertEqual(self.pa.encode('Alsop', 4), ('ALSP', '')) - self.assertEqual(self.pa.encode('Alwein', 4), ('ALN', '')) - self.assertEqual(self.pa.encode('Ambler', 4), ('AMPL', '')) - self.assertEqual(self.pa.encode('Andevill', 4), ('ANTF', '')) - self.assertEqual(self.pa.encode('Andrews', 4), ('ANTR', '')) - self.assertEqual(self.pa.encode('Andreyco', 4), ('ANTR', '')) - self.assertEqual(self.pa.encode('Andriesse', 4), ('ANTR', '')) - self.assertEqual(self.pa.encode('Angier', 4), ('ANJ', 'ANJR')) - self.assertEqual(self.pa.encode('Annabel', 4), ('ANPL', '')) - self.assertEqual(self.pa.encode('Anne', 4), ('AN', '')) - self.assertEqual(self.pa.encode('Anstye', 4), ('ANST', '')) - self.assertEqual(self.pa.encode('Appling', 4), ('APLN', '')) - self.assertEqual(self.pa.encode('Apuke', 4), ('APK', '')) - self.assertEqual(self.pa.encode('Arnold', 4), ('ARNL', '')) - 
self.assertEqual(self.pa.encode('Ashby', 4), ('AXP', '')) - self.assertEqual(self.pa.encode('Astwood', 4), ('ASTT', '')) - self.assertEqual(self.pa.encode('Atkinson', 4), ('ATKN', '')) - self.assertEqual(self.pa.encode('Audley', 4), ('ATL', '')) - self.assertEqual(self.pa.encode('Austin', 4), ('ASTN', '')) - self.assertEqual(self.pa.encode('Avenal', 4), ('AFNL', '')) - self.assertEqual(self.pa.encode('Ayer', 4), ('AR', '')) - self.assertEqual(self.pa.encode('Ayot', 4), ('AT', '')) - self.assertEqual(self.pa.encode('Babbitt', 4), ('PPT', '')) - self.assertEqual(self.pa.encode('Bachelor', 4), ('PXLR', 'PKLR')) - self.assertEqual(self.pa.encode('Bachelour', 4), ('PXLR', 'PKLR')) - self.assertEqual(self.pa.encode('Bailey', 4), ('PL', '')) - self.assertEqual(self.pa.encode('Baivel', 4), ('PFL', '')) - self.assertEqual(self.pa.encode('Baker', 4), ('PKR', '')) - self.assertEqual(self.pa.encode('Baldwin', 4), ('PLTN', '')) - self.assertEqual(self.pa.encode('Balsley', 4), ('PLSL', '')) - self.assertEqual(self.pa.encode('Barber', 4), ('PRPR', '')) - self.assertEqual(self.pa.encode('Barker', 4), ('PRKR', '')) - self.assertEqual(self.pa.encode('Barlow', 4), ('PRL', 'PRLF')) - self.assertEqual(self.pa.encode('Barnard', 4), ('PRNR', '')) - self.assertEqual(self.pa.encode('Barnes', 4), ('PRNS', '')) - self.assertEqual(self.pa.encode('Barnsley', 4), ('PRNS', '')) - self.assertEqual(self.pa.encode('Barouxis', 4), ('PRKS', '')) - self.assertEqual(self.pa.encode('Bartlet', 4), ('PRTL', '')) - self.assertEqual(self.pa.encode('Basley', 4), ('PSL', '')) - self.assertEqual(self.pa.encode('Basset', 4), ('PST', '')) - self.assertEqual(self.pa.encode('Bassett', 4), ('PST', '')) - self.assertEqual(self.pa.encode('Batchlor', 4), ('PXLR', '')) - self.assertEqual(self.pa.encode('Bates', 4), ('PTS', '')) - self.assertEqual(self.pa.encode('Batson', 4), ('PTSN', '')) - self.assertEqual(self.pa.encode('Bayes', 4), ('PS', '')) - self.assertEqual(self.pa.encode('Bayley', 4), ('PL', '')) - self.assertEqual(self.pa.encode('Beale', 4), ('PL', '')) - self.assertEqual(self.pa.encode('Beauchamp', 4), ('PXMP', 'PKMP')) - self.assertEqual(self.pa.encode('Beauclerc', 4), ('PKLR', '')) - self.assertEqual(self.pa.encode('Beech', 4), ('PK', '')) - self.assertEqual(self.pa.encode('Beers', 4), ('PRS', '')) - self.assertEqual(self.pa.encode('Beke', 4), ('PK', '')) - self.assertEqual(self.pa.encode('Belcher', 4), ('PLXR', 'PLKR')) - self.assertEqual(self.pa.encode('Benjamin', 4), ('PNJM', '')) - self.assertEqual(self.pa.encode('Benningham', 4), ('PNNK', '')) - self.assertEqual(self.pa.encode('Bereford', 4), ('PRFR', '')) - self.assertEqual(self.pa.encode('Bergen', 4), ('PRJN', 'PRKN')) - self.assertEqual(self.pa.encode('Berkeley', 4), ('PRKL', '')) - self.assertEqual(self.pa.encode('Berry', 4), ('PR', '')) - self.assertEqual(self.pa.encode('Besse', 4), ('PS', '')) - self.assertEqual(self.pa.encode('Bessey', 4), ('PS', '')) - self.assertEqual(self.pa.encode('Bessiles', 4), ('PSLS', '')) - self.assertEqual(self.pa.encode('Bigelow', 4), ('PJL', 'PKLF')) - self.assertEqual(self.pa.encode('Bigg', 4), ('PK', '')) - self.assertEqual(self.pa.encode('Bigod', 4), ('PKT', '')) - self.assertEqual(self.pa.encode('Billings', 4), ('PLNK', '')) - self.assertEqual(self.pa.encode('Bimper', 4), ('PMPR', '')) - self.assertEqual(self.pa.encode('Binker', 4), ('PNKR', '')) - self.assertEqual(self.pa.encode('Birdsill', 4), ('PRTS', '')) - self.assertEqual(self.pa.encode('Bishop', 4), ('PXP', '')) - self.assertEqual(self.pa.encode('Black', 4), ('PLK', '')) - 
self.assertEqual(self.pa.encode('Blagge', 4), ('PLK', '')) - self.assertEqual(self.pa.encode('Blake', 4), ('PLK', '')) - self.assertEqual(self.pa.encode('Blanck', 4), ('PLNK', '')) - self.assertEqual(self.pa.encode('Bledsoe', 4), ('PLTS', '')) - self.assertEqual(self.pa.encode('Blennerhasset', 4), ('PLNR', '')) - self.assertEqual(self.pa.encode('Blessing', 4), ('PLSN', '')) - self.assertEqual(self.pa.encode('Blewett', 4), ('PLT', '')) - self.assertEqual(self.pa.encode('Bloctgoed', 4), ('PLKT', '')) - self.assertEqual(self.pa.encode('Bloetgoet', 4), ('PLTK', '')) - self.assertEqual(self.pa.encode('Bloodgood', 4), ('PLTK', '')) - self.assertEqual(self.pa.encode('Blossom', 4), ('PLSM', '')) - self.assertEqual(self.pa.encode('Blount', 4), ('PLNT', '')) - self.assertEqual(self.pa.encode('Bodine', 4), ('PTN', '')) - self.assertEqual(self.pa.encode('Bodman', 4), ('PTMN', '')) - self.assertEqual(self.pa.encode('BonCoeur', 4), ('PNKR', '')) - self.assertEqual(self.pa.encode('Bond', 4), ('PNT', '')) - self.assertEqual(self.pa.encode('Boscawen', 4), ('PSKN', '')) - self.assertEqual(self.pa.encode('Bosworth', 4), ('PSR0', 'PSRT')) - self.assertEqual(self.pa.encode('Bouchier', 4), ('PX', 'PKR')) - self.assertEqual(self.pa.encode('Bowne', 4), ('PN', '')) - self.assertEqual(self.pa.encode('Bradbury', 4), ('PRTP', '')) - self.assertEqual(self.pa.encode('Bradder', 4), ('PRTR', '')) - self.assertEqual(self.pa.encode('Bradford', 4), ('PRTF', '')) - self.assertEqual(self.pa.encode('Bradstreet', 4), ('PRTS', '')) - self.assertEqual(self.pa.encode('Braham', 4), ('PRHM', '')) - self.assertEqual(self.pa.encode('Brailsford', 4), ('PRLS', '')) - self.assertEqual(self.pa.encode('Brainard', 4), ('PRNR', '')) - self.assertEqual(self.pa.encode('Brandish', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Braun', 4), ('PRN', '')) - self.assertEqual(self.pa.encode('Brecc', 4), ('PRK', '')) - self.assertEqual(self.pa.encode('Brent', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Brenton', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Briggs', 4), ('PRKS', '')) - self.assertEqual(self.pa.encode('Brigham', 4), ('PRM', '')) - self.assertEqual(self.pa.encode('Brobst', 4), ('PRPS', '')) - self.assertEqual(self.pa.encode('Brome', 4), ('PRM', '')) - self.assertEqual(self.pa.encode('Bronson', 4), ('PRNS', '')) - self.assertEqual(self.pa.encode('Brooks', 4), ('PRKS', '')) - self.assertEqual(self.pa.encode('Brouillard', 4), ('PRLR', '')) - self.assertEqual(self.pa.encode('Brown', 4), ('PRN', '')) - self.assertEqual(self.pa.encode('Browne', 4), ('PRN', '')) - self.assertEqual(self.pa.encode('Brownell', 4), ('PRNL', '')) - self.assertEqual(self.pa.encode('Bruley', 4), ('PRL', '')) - self.assertEqual(self.pa.encode('Bryant', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Brzozowski', 4), ('PRSS', 'PRTS')) - self.assertEqual(self.pa.encode('Buide', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Bulmer', 4), ('PLMR', '')) - self.assertEqual(self.pa.encode('Bunker', 4), ('PNKR', '')) - self.assertEqual(self.pa.encode('Burden', 4), ('PRTN', '')) - self.assertEqual(self.pa.encode('Burge', 4), ('PRJ', 'PRK')) - self.assertEqual(self.pa.encode('Burgoyne', 4), ('PRKN', '')) - self.assertEqual(self.pa.encode('Burke', 4), ('PRK', '')) - self.assertEqual(self.pa.encode('Burnett', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Burpee', 4), ('PRP', '')) - self.assertEqual(self.pa.encode('Bursley', 4), ('PRSL', '')) - self.assertEqual(self.pa.encode('Burton', 4), ('PRTN', '')) - 
self.assertEqual(self.pa.encode('Bushnell', 4), ('PXNL', '')) - self.assertEqual(self.pa.encode('Buss', 4), ('PS', '')) - self.assertEqual(self.pa.encode('Buswell', 4), ('PSL', '')) - self.assertEqual(self.pa.encode('Butler', 4), ('PTLR', '')) - self.assertEqual(self.pa.encode('Calkin', 4), ('KLKN', '')) - self.assertEqual(self.pa.encode('Canada', 4), ('KNT', '')) - self.assertEqual(self.pa.encode('Canmore', 4), ('KNMR', '')) - self.assertEqual(self.pa.encode('Canney', 4), ('KN', '')) - self.assertEqual(self.pa.encode('Capet', 4), ('KPT', '')) - self.assertEqual(self.pa.encode('Card', 4), ('KRT', '')) - self.assertEqual(self.pa.encode('Carman', 4), ('KRMN', '')) - self.assertEqual(self.pa.encode('Carpenter', 4), ('KRPN', '')) - self.assertEqual(self.pa.encode('Cartwright', 4), ('KRTR', '')) - self.assertEqual(self.pa.encode('Casey', 4), ('KS', '')) - self.assertEqual(self.pa.encode('Catterfield', 4), ('KTRF', '')) - self.assertEqual(self.pa.encode('Ceeley', 4), ('SL', '')) - self.assertEqual(self.pa.encode('Chambers', 4), ('XMPR', '')) - self.assertEqual(self.pa.encode('Champion', 4), ('XMPN', '')) - self.assertEqual(self.pa.encode('Chapman', 4), ('XPMN', '')) - self.assertEqual(self.pa.encode('Chase', 4), ('XS', '')) - self.assertEqual(self.pa.encode('Cheney', 4), ('XN', '')) - self.assertEqual(self.pa.encode('Chetwynd', 4), ('XTNT', '')) - self.assertEqual(self.pa.encode('Chevalier', 4), ('XFL', 'XFLR')) - self.assertEqual(self.pa.encode('Chillingsworth', 4), ('XLNK', '')) - self.assertEqual(self.pa.encode('Christie', 4), ('KRST', '')) - self.assertEqual(self.pa.encode('Chubbuck', 4), ('XPK', '')) - self.assertEqual(self.pa.encode('Church', 4), ('XRX', 'XRK')) - self.assertEqual(self.pa.encode('Clark', 4), ('KLRK', '')) - self.assertEqual(self.pa.encode('Clarke', 4), ('KLRK', '')) - self.assertEqual(self.pa.encode('Cleare', 4), ('KLR', '')) - self.assertEqual(self.pa.encode('Clement', 4), ('KLMN', '')) - self.assertEqual(self.pa.encode('Clerke', 4), ('KLRK', '')) - self.assertEqual(self.pa.encode('Clibben', 4), ('KLPN', '')) - self.assertEqual(self.pa.encode('Clifford', 4), ('KLFR', '')) - self.assertEqual(self.pa.encode('Clivedon', 4), ('KLFT', '')) - self.assertEqual(self.pa.encode('Close', 4), ('KLS', '')) - self.assertEqual(self.pa.encode('Clothilde', 4), ('KL0L', 'KLTL')) - self.assertEqual(self.pa.encode('Cobb', 4), ('KP', '')) - self.assertEqual(self.pa.encode('Coburn', 4), ('KPRN', '')) - self.assertEqual(self.pa.encode('Coburne', 4), ('KPRN', '')) - self.assertEqual(self.pa.encode('Cocke', 4), ('KK', '')) - self.assertEqual(self.pa.encode('Coffin', 4), ('KFN', '')) - self.assertEqual(self.pa.encode('Coffyn', 4), ('KFN', '')) - self.assertEqual(self.pa.encode('Colborne', 4), ('KLPR', '')) - self.assertEqual(self.pa.encode('Colby', 4), ('KLP', '')) - self.assertEqual(self.pa.encode('Cole', 4), ('KL', '')) - self.assertEqual(self.pa.encode('Coleman', 4), ('KLMN', '')) - self.assertEqual(self.pa.encode('Collier', 4), ('KL', 'KLR')) - self.assertEqual(self.pa.encode('Compton', 4), ('KMPT', '')) - self.assertEqual(self.pa.encode('Cone', 4), ('KN', '')) - self.assertEqual(self.pa.encode('Cook', 4), ('KK', '')) - self.assertEqual(self.pa.encode('Cooke', 4), ('KK', '')) - self.assertEqual(self.pa.encode('Cooper', 4), ('KPR', '')) - self.assertEqual(self.pa.encode('Copperthwaite', 4), ('KPR0', 'KPRT')) - self.assertEqual(self.pa.encode('Corbet', 4), ('KRPT', '')) - self.assertEqual(self.pa.encode('Corell', 4), ('KRL', '')) - self.assertEqual(self.pa.encode('Corey', 4), ('KR', '')) - 
self.assertEqual(self.pa.encode('Corlies', 4), ('KRLS', '')) - self.assertEqual(self.pa.encode('Corneliszen', 4), ('KRNL', '')) - self.assertEqual(self.pa.encode('Cornelius', 4), ('KRNL', '')) - self.assertEqual(self.pa.encode('Cornwallis', 4), ('KRNL', '')) - self.assertEqual(self.pa.encode('Cosgrove', 4), ('KSKR', '')) - self.assertEqual(self.pa.encode('Count of Brionne', 4), ('KNTF', '')) - self.assertEqual(self.pa.encode('Covill', 4), ('KFL', '')) - self.assertEqual(self.pa.encode('Cowperthwaite', 4), ('KPR0', 'KPRT')) - self.assertEqual(self.pa.encode('Cowperwaite', 4), ('KPRT', '')) - self.assertEqual(self.pa.encode('Crane', 4), ('KRN', '')) - self.assertEqual(self.pa.encode('Creagmile', 4), ('KRKM', '')) - self.assertEqual(self.pa.encode('Crew', 4), ('KR', 'KRF')) - self.assertEqual(self.pa.encode('Crispin', 4), ('KRSP', '')) - self.assertEqual(self.pa.encode('Crocker', 4), ('KRKR', '')) - self.assertEqual(self.pa.encode('Crockett', 4), ('KRKT', '')) - self.assertEqual(self.pa.encode('Crosby', 4), ('KRSP', '')) - self.assertEqual(self.pa.encode('Crump', 4), ('KRMP', '')) - self.assertEqual(self.pa.encode('Cunningham', 4), ('KNNK', '')) - self.assertEqual(self.pa.encode('Curtis', 4), ('KRTS', '')) - self.assertEqual(self.pa.encode('Cutha', 4), ('K0', 'KT')) - self.assertEqual(self.pa.encode('Cutter', 4), ('KTR', '')) - self.assertEqual(self.pa.encode('D\'Aubigny', 4), ('TPN', 'TPKN')) - self.assertEqual(self.pa.encode('DAVIS', 4), ('TFS', '')) - self.assertEqual(self.pa.encode('Dabinott', 4), ('TPNT', '')) - self.assertEqual(self.pa.encode('Dacre', 4), ('TKR', '')) - self.assertEqual(self.pa.encode('Daggett', 4), ('TKT', '')) - self.assertEqual(self.pa.encode('Danvers', 4), ('TNFR', '')) - self.assertEqual(self.pa.encode('Darcy', 4), ('TRS', '')) - self.assertEqual(self.pa.encode('Davis', 4), ('TFS', '')) - self.assertEqual(self.pa.encode('Dawn', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Dawson', 4), ('TSN', '')) - self.assertEqual(self.pa.encode('Day', 4), ('T', '')) - self.assertEqual(self.pa.encode('Daye', 4), ('T', '')) - self.assertEqual(self.pa.encode('DeGrenier', 4), ('TKRN', '')) - self.assertEqual(self.pa.encode('Dean', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Deekindaugh', 4), ('TKNT', '')) - self.assertEqual(self.pa.encode('Dennis', 4), ('TNS', '')) - self.assertEqual(self.pa.encode('Denny', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Denton', 4), ('TNTN', '')) - self.assertEqual(self.pa.encode('Desborough', 4), ('TSPR', '')) - self.assertEqual(self.pa.encode('Despenser', 4), ('TSPN', '')) - self.assertEqual(self.pa.encode('Deverill', 4), ('TFRL', '')) - self.assertEqual(self.pa.encode('Devine', 4), ('TFN', '')) - self.assertEqual(self.pa.encode('Dexter', 4), ('TKST', '')) - self.assertEqual(self.pa.encode('Dillaway', 4), ('TL', '')) - self.assertEqual(self.pa.encode('Dimmick', 4), ('TMK', '')) - self.assertEqual(self.pa.encode('Dinan', 4), ('TNN', '')) - self.assertEqual(self.pa.encode('Dix', 4), ('TKS', '')) - self.assertEqual(self.pa.encode('Doggett', 4), ('TKT', '')) - self.assertEqual(self.pa.encode('Donahue', 4), ('TNH', '')) - self.assertEqual(self.pa.encode('Dorfman', 4), ('TRFM', '')) - self.assertEqual(self.pa.encode('Dorris', 4), ('TRS', '')) - self.assertEqual(self.pa.encode('Dow', 4), ('T', 'TF')) - self.assertEqual(self.pa.encode('Downey', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Downing', 4), ('TNNK', '')) - self.assertEqual(self.pa.encode('Dowsett', 4), ('TST', '')) - self.assertEqual(self.pa.encode('Duck?', 4), ('TK', 
'')) - self.assertEqual(self.pa.encode('Dudley', 4), ('TTL', '')) - self.assertEqual(self.pa.encode('Duffy', 4), ('TF', '')) - self.assertEqual(self.pa.encode('Dunn', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Dunsterville', 4), ('TNST', '')) - self.assertEqual(self.pa.encode('Durrant', 4), ('TRNT', '')) - self.assertEqual(self.pa.encode('Durrin', 4), ('TRN', '')) - self.assertEqual(self.pa.encode('Dustin', 4), ('TSTN', '')) - self.assertEqual(self.pa.encode('Duston', 4), ('TSTN', '')) - self.assertEqual(self.pa.encode('Eames', 4), ('AMS', '')) - self.assertEqual(self.pa.encode('Early', 4), ('ARL', '')) - self.assertEqual(self.pa.encode('Easty', 4), ('AST', '')) - self.assertEqual(self.pa.encode('Ebbett', 4), ('APT', '')) - self.assertEqual(self.pa.encode('Eberbach', 4), ('APRP', '')) - self.assertEqual(self.pa.encode('Eberhard', 4), ('APRR', '')) - self.assertEqual(self.pa.encode('Eddy', 4), ('AT', '')) - self.assertEqual(self.pa.encode('Edenden', 4), ('ATNT', '')) - self.assertEqual(self.pa.encode('Edwards', 4), ('ATRT', '')) - self.assertEqual(self.pa.encode('Eglinton', 4), ('AKLN', 'ALNT')) - self.assertEqual(self.pa.encode('Eliot', 4), ('ALT', '')) - self.assertEqual(self.pa.encode('Elizabeth', 4), ('ALSP', '')) - self.assertEqual(self.pa.encode('Ellis', 4), ('ALS', '')) - self.assertEqual(self.pa.encode('Ellison', 4), ('ALSN', '')) - self.assertEqual(self.pa.encode('Ellot', 4), ('ALT', '')) - self.assertEqual(self.pa.encode('Elny', 4), ('ALN', '')) - self.assertEqual(self.pa.encode('Elsner', 4), ('ALSN', '')) - self.assertEqual(self.pa.encode('Emerson', 4), ('AMRS', '')) - self.assertEqual(self.pa.encode('Empson', 4), ('AMPS', '')) - self.assertEqual(self.pa.encode('Est', 4), ('AST', '')) - self.assertEqual(self.pa.encode('Estabrook', 4), ('ASTP', '')) - self.assertEqual(self.pa.encode('Estes', 4), ('ASTS', '')) - self.assertEqual(self.pa.encode('Estey', 4), ('AST', '')) - self.assertEqual(self.pa.encode('Evans', 4), ('AFNS', '')) - self.assertEqual(self.pa.encode('Fallowell', 4), ('FLL', '')) - self.assertEqual(self.pa.encode('Farnsworth', 4), ('FRNS', '')) - self.assertEqual(self.pa.encode('Feake', 4), ('FK', '')) - self.assertEqual(self.pa.encode('Feke', 4), ('FK', '')) - self.assertEqual(self.pa.encode('Fellows', 4), ('FLS', '')) - self.assertEqual(self.pa.encode('Fettiplace', 4), ('FTPL', '')) - self.assertEqual(self.pa.encode('Finney', 4), ('FN', '')) - self.assertEqual(self.pa.encode('Fischer', 4), ('FXR', 'FSKR')) - self.assertEqual(self.pa.encode('Fisher', 4), ('FXR', '')) - self.assertEqual(self.pa.encode('Fisk', 4), ('FSK', '')) - self.assertEqual(self.pa.encode('Fiske', 4), ('FSK', '')) - self.assertEqual(self.pa.encode('Fletcher', 4), ('FLXR', '')) - self.assertEqual(self.pa.encode('Folger', 4), ('FLKR', 'FLJR')) - self.assertEqual(self.pa.encode('Foliot', 4), ('FLT', '')) - self.assertEqual(self.pa.encode('Folyot', 4), ('FLT', '')) - self.assertEqual(self.pa.encode('Fones', 4), ('FNS', '')) - self.assertEqual(self.pa.encode('Fordham', 4), ('FRTM', '')) - self.assertEqual(self.pa.encode('Forstner', 4), ('FRST', '')) - self.assertEqual(self.pa.encode('Fosten', 4), ('FSTN', '')) - self.assertEqual(self.pa.encode('Foster', 4), ('FSTR', '')) - self.assertEqual(self.pa.encode('Foulke', 4), ('FLK', '')) - self.assertEqual(self.pa.encode('Fowler', 4), ('FLR', '')) - self.assertEqual(self.pa.encode('Foxwell', 4), ('FKSL', '')) - self.assertEqual(self.pa.encode('Fraley', 4), ('FRL', '')) - self.assertEqual(self.pa.encode('Franceys', 4), ('FRNS', '')) - 
self.assertEqual(self.pa.encode('Franke', 4), ('FRNK', '')) - self.assertEqual(self.pa.encode('Frascella', 4), ('FRSL', '')) - self.assertEqual(self.pa.encode('Frazer', 4), ('FRSR', '')) - self.assertEqual(self.pa.encode('Fredd', 4), ('FRT', '')) - self.assertEqual(self.pa.encode('Freeman', 4), ('FRMN', '')) - self.assertEqual(self.pa.encode('French', 4), ('FRNX', 'FRNK')) - self.assertEqual(self.pa.encode('Freville', 4), ('FRFL', '')) - self.assertEqual(self.pa.encode('Frey', 4), ('FR', '')) - self.assertEqual(self.pa.encode('Frick', 4), ('FRK', '')) - self.assertEqual(self.pa.encode('Frier', 4), ('FR', 'FRR')) - self.assertEqual(self.pa.encode('Froe', 4), ('FR', '')) - self.assertEqual(self.pa.encode('Frorer', 4), ('FRRR', '')) - self.assertEqual(self.pa.encode('Frost', 4), ('FRST', '')) - self.assertEqual(self.pa.encode('Frothingham', 4), ('FR0N', 'FRTN')) - self.assertEqual(self.pa.encode('Fry', 4), ('FR', '')) - self.assertEqual(self.pa.encode('Gaffney', 4), ('KFN', '')) - self.assertEqual(self.pa.encode('Gage', 4), ('KJ', 'KK')) - self.assertEqual(self.pa.encode('Gallion', 4), ('KLN', '')) - self.assertEqual(self.pa.encode('Gallishan', 4), ('KLXN', '')) - self.assertEqual(self.pa.encode('Gamble', 4), ('KMPL', '')) - self.assertEqual(self.pa.encode('Garbrand', 4), ('KRPR', '')) - self.assertEqual(self.pa.encode('Gardner', 4), ('KRTN', '')) - self.assertEqual(self.pa.encode('Garrett', 4), ('KRT', '')) - self.assertEqual(self.pa.encode('Gassner', 4), ('KSNR', '')) - self.assertEqual(self.pa.encode('Gater', 4), ('KTR', '')) - self.assertEqual(self.pa.encode('Gaunt', 4), ('KNT', '')) - self.assertEqual(self.pa.encode('Gayer', 4), ('KR', '')) - self.assertEqual(self.pa.encode('Gerken', 4), ('KRKN', 'JRKN')) - self.assertEqual(self.pa.encode('Gerritsen', 4), ('KRTS', 'JRTS')) - self.assertEqual(self.pa.encode('Gibbs', 4), ('KPS', 'JPS')) - self.assertEqual(self.pa.encode('Giffard', 4), ('JFRT', 'KFRT')) - self.assertEqual(self.pa.encode('Gilbert', 4), ('KLPR', 'JLPR')) - self.assertEqual(self.pa.encode('Gill', 4), ('KL', 'JL')) - self.assertEqual(self.pa.encode('Gilman', 4), ('KLMN', 'JLMN')) - self.assertEqual(self.pa.encode('Glass', 4), ('KLS', '')) - self.assertEqual(self.pa.encode('GoddardGifford', 4), ('KTRJ', '')) - self.assertEqual(self.pa.encode('Godfrey', 4), ('KTFR', '')) - self.assertEqual(self.pa.encode('Godwin', 4), ('KTN', '')) - self.assertEqual(self.pa.encode('Goodale', 4), ('KTL', '')) - self.assertEqual(self.pa.encode('Goodnow', 4), ('KTN', 'KTNF')) - self.assertEqual(self.pa.encode('Gorham', 4), ('KRM', '')) - self.assertEqual(self.pa.encode('Goseline', 4), ('KSLN', '')) - self.assertEqual(self.pa.encode('Gott', 4), ('KT', '')) - self.assertEqual(self.pa.encode('Gould', 4), ('KLT', '')) - self.assertEqual(self.pa.encode('Grafton', 4), ('KRFT', '')) - self.assertEqual(self.pa.encode('Grant', 4), ('KRNT', '')) - self.assertEqual(self.pa.encode('Gray', 4), ('KR', '')) - self.assertEqual(self.pa.encode('Green', 4), ('KRN', '')) - self.assertEqual(self.pa.encode('Griffin', 4), ('KRFN', '')) - self.assertEqual(self.pa.encode('Grill', 4), ('KRL', '')) - self.assertEqual(self.pa.encode('Grim', 4), ('KRM', '')) - self.assertEqual(self.pa.encode('Grisgonelle', 4), ('KRSK', '')) - self.assertEqual(self.pa.encode('Gross', 4), ('KRS', '')) - self.assertEqual(self.pa.encode('Guba', 4), ('KP', '')) - self.assertEqual(self.pa.encode('Gybbes', 4), ('KPS', 'JPS')) - self.assertEqual(self.pa.encode('Haburne', 4), ('HPRN', '')) - self.assertEqual(self.pa.encode('Hackburne', 4), ('HKPR', '')) 
- self.assertEqual(self.pa.encode('Haddon?', 4), ('HTN', '')) - self.assertEqual(self.pa.encode('Haines', 4), ('HNS', '')) - self.assertEqual(self.pa.encode('Hale', 4), ('HL', '')) - self.assertEqual(self.pa.encode('Hall', 4), ('HL', '')) - self.assertEqual(self.pa.encode('Hallet', 4), ('HLT', '')) - self.assertEqual(self.pa.encode('Hallock', 4), ('HLK', '')) - self.assertEqual(self.pa.encode('Halstead', 4), ('HLST', '')) - self.assertEqual(self.pa.encode('Hammond', 4), ('HMNT', '')) - self.assertEqual(self.pa.encode('Hance', 4), ('HNS', '')) - self.assertEqual(self.pa.encode('Handy', 4), ('HNT', '')) - self.assertEqual(self.pa.encode('Hanson', 4), ('HNSN', '')) - self.assertEqual(self.pa.encode('Harasek', 4), ('HRSK', '')) - self.assertEqual(self.pa.encode('Harcourt', 4), ('HRKR', '')) - self.assertEqual(self.pa.encode('Hardy', 4), ('HRT', '')) - self.assertEqual(self.pa.encode('Harlock', 4), ('HRLK', '')) - self.assertEqual(self.pa.encode('Harris', 4), ('HRS', '')) - self.assertEqual(self.pa.encode('Hartley', 4), ('HRTL', '')) - self.assertEqual(self.pa.encode('Harvey', 4), ('HRF', '')) - self.assertEqual(self.pa.encode('Harvie', 4), ('HRF', '')) - self.assertEqual(self.pa.encode('Harwood', 4), ('HRT', '')) - self.assertEqual(self.pa.encode('Hathaway', 4), ('H0', 'HT')) - self.assertEqual(self.pa.encode('Haukeness', 4), ('HKNS', '')) - self.assertEqual(self.pa.encode('Hawkes', 4), ('HKS', '')) - self.assertEqual(self.pa.encode('Hawkhurst', 4), ('HKRS', '')) - self.assertEqual(self.pa.encode('Hawkins', 4), ('HKNS', '')) - self.assertEqual(self.pa.encode('Hawley', 4), ('HL', '')) - self.assertEqual(self.pa.encode('Heald', 4), ('HLT', '')) - self.assertEqual(self.pa.encode('Helsdon', 4), ('HLST', '')) - self.assertEqual(self.pa.encode('Hemenway', 4), ('HMN', '')) - self.assertEqual(self.pa.encode('Hemmenway', 4), ('HMN', '')) - self.assertEqual(self.pa.encode('Henck', 4), ('HNK', '')) - self.assertEqual(self.pa.encode('Henderson', 4), ('HNTR', '')) - self.assertEqual(self.pa.encode('Hendricks', 4), ('HNTR', '')) - self.assertEqual(self.pa.encode('Hersey', 4), ('HRS', '')) - self.assertEqual(self.pa.encode('Hewes', 4), ('HS', '')) - self.assertEqual(self.pa.encode('Heyman', 4), ('HMN', '')) - self.assertEqual(self.pa.encode('Hicks', 4), ('HKS', '')) - self.assertEqual(self.pa.encode('Hidden', 4), ('HTN', '')) - self.assertEqual(self.pa.encode('Higgs', 4), ('HKS', '')) - self.assertEqual(self.pa.encode('Hill', 4), ('HL', '')) - self.assertEqual(self.pa.encode('Hills', 4), ('HLS', '')) - self.assertEqual(self.pa.encode('Hinckley', 4), ('HNKL', '')) - self.assertEqual(self.pa.encode('Hipwell', 4), ('HPL', '')) - self.assertEqual(self.pa.encode('Hobart', 4), ('HPRT', '')) - self.assertEqual(self.pa.encode('Hoben', 4), ('HPN', '')) - self.assertEqual(self.pa.encode('Hoffmann', 4), ('HFMN', '')) - self.assertEqual(self.pa.encode('Hogan', 4), ('HKN', '')) - self.assertEqual(self.pa.encode('Holmes', 4), ('HLMS', '')) - self.assertEqual(self.pa.encode('Hoo', 4), ('H', '')) - self.assertEqual(self.pa.encode('Hooker', 4), ('HKR', '')) - self.assertEqual(self.pa.encode('Hopcott', 4), ('HPKT', '')) - self.assertEqual(self.pa.encode('Hopkins', 4), ('HPKN', '')) - self.assertEqual(self.pa.encode('Hopkinson', 4), ('HPKN', '')) - self.assertEqual(self.pa.encode('Hornsey', 4), ('HRNS', '')) - self.assertEqual(self.pa.encode('Houckgeest', 4), ('HKJS', 'HKKS')) - self.assertEqual(self.pa.encode('Hough', 4), ('H', '')) - self.assertEqual(self.pa.encode('Houstin', 4), ('HSTN', '')) - 
self.assertEqual(self.pa.encode('How', 4), ('H', 'HF')) - self.assertEqual(self.pa.encode('Howe', 4), ('H', '')) - self.assertEqual(self.pa.encode('Howland', 4), ('HLNT', '')) - self.assertEqual(self.pa.encode('Hubner', 4), ('HPNR', '')) - self.assertEqual(self.pa.encode('Hudnut', 4), ('HTNT', '')) - self.assertEqual(self.pa.encode('Hughes', 4), ('HS', '')) - self.assertEqual(self.pa.encode('Hull', 4), ('HL', '')) - self.assertEqual(self.pa.encode('Hulme', 4), ('HLM', '')) - self.assertEqual(self.pa.encode('Hume', 4), ('HM', '')) - self.assertEqual(self.pa.encode('Hundertumark', 4), ('HNTR', '')) - self.assertEqual(self.pa.encode('Hundley', 4), ('HNTL', '')) - self.assertEqual(self.pa.encode('Hungerford', 4), ('HNKR', 'HNJR')) - self.assertEqual(self.pa.encode('Hunt', 4), ('HNT', '')) - self.assertEqual(self.pa.encode('Hurst', 4), ('HRST', '')) - self.assertEqual(self.pa.encode('Husbands', 4), ('HSPN', '')) - self.assertEqual(self.pa.encode('Hussey', 4), ('HS', '')) - self.assertEqual(self.pa.encode('Husted', 4), ('HSTT', '')) - self.assertEqual(self.pa.encode('Hutchins', 4), ('HXNS', '')) - self.assertEqual(self.pa.encode('Hutchinson', 4), ('HXNS', '')) - self.assertEqual(self.pa.encode('Huttinger', 4), ('HTNK', 'HTNJ')) - self.assertEqual(self.pa.encode('Huybertsen', 4), ('HPRT', '')) - self.assertEqual(self.pa.encode('Iddenden', 4), ('ATNT', '')) - self.assertEqual(self.pa.encode('Ingraham', 4), ('ANKR', '')) - self.assertEqual(self.pa.encode('Ives', 4), ('AFS', '')) - self.assertEqual(self.pa.encode('Jackson', 4), ('JKSN', 'AKSN')) - self.assertEqual(self.pa.encode('Jacob', 4), ('JKP', 'AKP')) - self.assertEqual(self.pa.encode('Jans', 4), ('JNS', 'ANS')) - self.assertEqual(self.pa.encode('Jenkins', 4), ('JNKN', 'ANKN')) - self.assertEqual(self.pa.encode('Jewett', 4), ('JT', 'AT')) - self.assertEqual(self.pa.encode('Jewitt', 4), ('JT', 'AT')) - self.assertEqual(self.pa.encode('Johnson', 4), ('JNSN', 'ANSN')) - self.assertEqual(self.pa.encode('Jones', 4), ('JNS', 'ANS')) - self.assertEqual(self.pa.encode('Josephine', 4), ('JSFN', 'HSFN')) - self.assertEqual(self.pa.encode('Judd', 4), ('JT', 'AT')) - self.assertEqual(self.pa.encode('June', 4), ('JN', 'AN')) - self.assertEqual(self.pa.encode('Kamarowska', 4), ('KMRS', '')) - self.assertEqual(self.pa.encode('Kay', 4), ('K', '')) - self.assertEqual(self.pa.encode('Kelley', 4), ('KL', '')) - self.assertEqual(self.pa.encode('Kelly', 4), ('KL', '')) - self.assertEqual(self.pa.encode('Keymber', 4), ('KMPR', '')) - self.assertEqual(self.pa.encode('Keynes', 4), ('KNS', '')) - self.assertEqual(self.pa.encode('Kilham', 4), ('KLM', '')) - self.assertEqual(self.pa.encode('Kim', 4), ('KM', '')) - self.assertEqual(self.pa.encode('Kimball', 4), ('KMPL', '')) - self.assertEqual(self.pa.encode('King', 4), ('KNK', '')) - self.assertEqual(self.pa.encode('Kinsey', 4), ('KNS', '')) - self.assertEqual(self.pa.encode('Kirk', 4), ('KRK', '')) - self.assertEqual(self.pa.encode('Kirton', 4), ('KRTN', '')) - self.assertEqual(self.pa.encode('Kistler', 4), ('KSTL', '')) - self.assertEqual(self.pa.encode('Kitchen', 4), ('KXN', '')) - self.assertEqual(self.pa.encode('Kitson', 4), ('KTSN', '')) - self.assertEqual(self.pa.encode('Klett', 4), ('KLT', '')) - self.assertEqual(self.pa.encode('Kline', 4), ('KLN', '')) - self.assertEqual(self.pa.encode('Knapp', 4), ('NP', '')) - self.assertEqual(self.pa.encode('Knight', 4), ('NT', '')) - self.assertEqual(self.pa.encode('Knote', 4), ('NT', '')) - self.assertEqual(self.pa.encode('Knott', 4), ('NT', '')) - 
self.assertEqual(self.pa.encode('Knox', 4), ('NKS', '')) - self.assertEqual(self.pa.encode('Koeller', 4), ('KLR', '')) - self.assertEqual(self.pa.encode('La Pointe', 4), ('LPNT', '')) - self.assertEqual(self.pa.encode('LaPlante', 4), ('LPLN', '')) - self.assertEqual(self.pa.encode('Laimbeer', 4), ('LMPR', '')) - self.assertEqual(self.pa.encode('Lamb', 4), ('LMP', '')) - self.assertEqual(self.pa.encode('Lambertson', 4), ('LMPR', '')) - self.assertEqual(self.pa.encode('Lancto', 4), ('LNKT', '')) - self.assertEqual(self.pa.encode('Landry', 4), ('LNTR', '')) - self.assertEqual(self.pa.encode('Lane', 4), ('LN', '')) - self.assertEqual(self.pa.encode('Langendyck', 4), ('LNJN', 'LNKN')) - self.assertEqual(self.pa.encode('Langer', 4), ('LNKR', 'LNJR')) - self.assertEqual(self.pa.encode('Langford', 4), ('LNKF', '')) - self.assertEqual(self.pa.encode('Lantersee', 4), ('LNTR', '')) - self.assertEqual(self.pa.encode('Laquer', 4), ('LKR', '')) - self.assertEqual(self.pa.encode('Larkin', 4), ('LRKN', '')) - self.assertEqual(self.pa.encode('Latham', 4), ('LTM', '')) - self.assertEqual(self.pa.encode('Lathrop', 4), ('L0RP', 'LTRP')) - self.assertEqual(self.pa.encode('Lauter', 4), ('LTR', '')) - self.assertEqual(self.pa.encode('Lawrence', 4), ('LRNS', '')) - self.assertEqual(self.pa.encode('Leach', 4), ('LK', '')) - self.assertEqual(self.pa.encode('Leager', 4), ('LKR', 'LJR')) - self.assertEqual(self.pa.encode('Learned', 4), ('LRNT', '')) - self.assertEqual(self.pa.encode('Leavitt', 4), ('LFT', '')) - self.assertEqual(self.pa.encode('Lee', 4), ('L', '')) - self.assertEqual(self.pa.encode('Leete', 4), ('LT', '')) - self.assertEqual(self.pa.encode('Leggett', 4), ('LKT', '')) - self.assertEqual(self.pa.encode('Leland', 4), ('LLNT', '')) - self.assertEqual(self.pa.encode('Leonard', 4), ('LNRT', '')) - self.assertEqual(self.pa.encode('Lester', 4), ('LSTR', '')) - self.assertEqual(self.pa.encode('Lestrange', 4), ('LSTR', '')) - self.assertEqual(self.pa.encode('Lethem', 4), ('L0M', 'LTM')) - self.assertEqual(self.pa.encode('Levine', 4), ('LFN', '')) - self.assertEqual(self.pa.encode('Lewes', 4), ('LS', '')) - self.assertEqual(self.pa.encode('Lewis', 4), ('LS', '')) - self.assertEqual(self.pa.encode('Lincoln', 4), ('LNKL', '')) - self.assertEqual(self.pa.encode('Lindsey', 4), ('LNTS', '')) - self.assertEqual(self.pa.encode('Linher', 4), ('LNR', '')) - self.assertEqual(self.pa.encode('Lippet', 4), ('LPT', '')) - self.assertEqual(self.pa.encode('Lippincott', 4), ('LPNK', '')) - self.assertEqual(self.pa.encode('Lockwood', 4), ('LKT', '')) - self.assertEqual(self.pa.encode('Loines', 4), ('LNS', '')) - self.assertEqual(self.pa.encode('Lombard', 4), ('LMPR', '')) - self.assertEqual(self.pa.encode('Long', 4), ('LNK', '')) - self.assertEqual(self.pa.encode('Longespee', 4), ('LNJS', 'LNKS')) - self.assertEqual(self.pa.encode('Look', 4), ('LK', '')) - self.assertEqual(self.pa.encode('Lounsberry', 4), ('LNSP', '')) - self.assertEqual(self.pa.encode('Lounsbury', 4), ('LNSP', '')) - self.assertEqual(self.pa.encode('Louthe', 4), ('L0', 'LT')) - self.assertEqual(self.pa.encode('Loveyne', 4), ('LFN', '')) - self.assertEqual(self.pa.encode('Lowe', 4), ('L', '')) - self.assertEqual(self.pa.encode('Ludlam', 4), ('LTLM', '')) - self.assertEqual(self.pa.encode('Lumbard', 4), ('LMPR', '')) - self.assertEqual(self.pa.encode('Lund', 4), ('LNT', '')) - self.assertEqual(self.pa.encode('Luno', 4), ('LN', '')) - self.assertEqual(self.pa.encode('Lutz', 4), ('LTS', '')) - self.assertEqual(self.pa.encode('Lydia', 4), ('LT', '')) - 
self.assertEqual(self.pa.encode('Lynne', 4), ('LN', '')) - self.assertEqual(self.pa.encode('Lyon', 4), ('LN', '')) - self.assertEqual(self.pa.encode('MacAlpin', 4), ('MKLP', '')) - self.assertEqual(self.pa.encode('MacBricc', 4), ('MKPR', '')) - self.assertEqual(self.pa.encode('MacCrinan', 4), ('MKRN', '')) - self.assertEqual(self.pa.encode('MacKenneth', 4), ('MKN0', 'MKNT')) - self.assertEqual(self.pa.encode('MacMael nam Bo', 4), ('MKML', '')) - self.assertEqual(self.pa.encode('MacMurchada', 4), ('MKMR', '')) - self.assertEqual(self.pa.encode('Macomber', 4), ('MKMP', '')) - self.assertEqual(self.pa.encode('Macy', 4), ('MS', '')) - self.assertEqual(self.pa.encode('Magnus', 4), ('MNS', 'MKNS')) - self.assertEqual(self.pa.encode('Mahien', 4), ('MHN', '')) - self.assertEqual(self.pa.encode('Malmains', 4), ('MLMN', '')) - self.assertEqual(self.pa.encode('Malory', 4), ('MLR', '')) - self.assertEqual(self.pa.encode('Mancinelli', 4), ('MNSN', '')) - self.assertEqual(self.pa.encode('Mancini', 4), ('MNSN', '')) - self.assertEqual(self.pa.encode('Mann', 4), ('MN', '')) - self.assertEqual(self.pa.encode('Manning', 4), ('MNNK', '')) - self.assertEqual(self.pa.encode('Manter', 4), ('MNTR', '')) - self.assertEqual(self.pa.encode('Marion', 4), ('MRN', '')) - self.assertEqual(self.pa.encode('Marley', 4), ('MRL', '')) - self.assertEqual(self.pa.encode('Marmion', 4), ('MRMN', '')) - self.assertEqual(self.pa.encode('Marquart', 4), ('MRKR', '')) - self.assertEqual(self.pa.encode('Marsh', 4), ('MRX', '')) - self.assertEqual(self.pa.encode('Marshal', 4), ('MRXL', '')) - self.assertEqual(self.pa.encode('Marshall', 4), ('MRXL', '')) - self.assertEqual(self.pa.encode('Martel', 4), ('MRTL', '')) - self.assertEqual(self.pa.encode('Martha', 4), ('MR0', 'MRT')) - self.assertEqual(self.pa.encode('Martin', 4), ('MRTN', '')) - self.assertEqual(self.pa.encode('Marturano', 4), ('MRTR', '')) - self.assertEqual(self.pa.encode('Marvin', 4), ('MRFN', '')) - self.assertEqual(self.pa.encode('Mary', 4), ('MR', '')) - self.assertEqual(self.pa.encode('Mason', 4), ('MSN', '')) - self.assertEqual(self.pa.encode('Maxwell', 4), ('MKSL', '')) - self.assertEqual(self.pa.encode('Mayhew', 4), ('MH', 'MHF')) - self.assertEqual(self.pa.encode('McAllaster', 4), ('MKLS', '')) - self.assertEqual(self.pa.encode('McAllister', 4), ('MKLS', '')) - self.assertEqual(self.pa.encode('McConnell', 4), ('MKNL', '')) - self.assertEqual(self.pa.encode('McFarland', 4), ('MKFR', '')) - self.assertEqual(self.pa.encode('McIlroy', 4), ('MSLR', '')) - self.assertEqual(self.pa.encode('McNair', 4), ('MKNR', '')) - self.assertEqual(self.pa.encode('McNair-Landry', 4), ('MKNR', '')) - self.assertEqual(self.pa.encode('McRaven', 4), ('MKRF', '')) - self.assertEqual(self.pa.encode('Mead', 4), ('MT', '')) - self.assertEqual(self.pa.encode('Meade', 4), ('MT', '')) - self.assertEqual(self.pa.encode('Meck', 4), ('MK', '')) - self.assertEqual(self.pa.encode('Melton', 4), ('MLTN', '')) - self.assertEqual(self.pa.encode('Mendenhall', 4), ('MNTN', '')) - self.assertEqual(self.pa.encode('Mering', 4), ('MRNK', '')) - self.assertEqual(self.pa.encode('Merrick', 4), ('MRK', '')) - self.assertEqual(self.pa.encode('Merry', 4), ('MR', '')) - self.assertEqual(self.pa.encode('Mighill', 4), ('ML', '')) - self.assertEqual(self.pa.encode('Miller', 4), ('MLR', '')) - self.assertEqual(self.pa.encode('Milton', 4), ('MLTN', '')) - self.assertEqual(self.pa.encode('Mohun', 4), ('MHN', '')) - self.assertEqual(self.pa.encode('Montague', 4), ('MNTK', '')) - 
self.assertEqual(self.pa.encode('Montboucher', 4), ('MNTP', '')) - self.assertEqual(self.pa.encode('Moore', 4), ('MR', '')) - self.assertEqual(self.pa.encode('Morrel', 4), ('MRL', '')) - self.assertEqual(self.pa.encode('Morrill', 4), ('MRL', '')) - self.assertEqual(self.pa.encode('Morris', 4), ('MRS', '')) - self.assertEqual(self.pa.encode('Morton', 4), ('MRTN', '')) - self.assertEqual(self.pa.encode('Moton', 4), ('MTN', '')) - self.assertEqual(self.pa.encode('Muir', 4), ('MR', '')) - self.assertEqual(self.pa.encode('Mulferd', 4), ('MLFR', '')) - self.assertEqual(self.pa.encode('Mullins', 4), ('MLNS', '')) - self.assertEqual(self.pa.encode('Mulso', 4), ('MLS', '')) - self.assertEqual(self.pa.encode('Munger', 4), ('MNKR', 'MNJR')) - self.assertEqual(self.pa.encode('Munt', 4), ('MNT', '')) - self.assertEqual(self.pa.encode('Murchad', 4), ('MRXT', 'MRKT')) - self.assertEqual(self.pa.encode('Murdock', 4), ('MRTK', '')) - self.assertEqual(self.pa.encode('Murray', 4), ('MR', '')) - self.assertEqual(self.pa.encode('Muskett', 4), ('MSKT', '')) - self.assertEqual(self.pa.encode('Myers', 4), ('MRS', '')) - self.assertEqual(self.pa.encode('Myrick', 4), ('MRK', '')) - self.assertEqual(self.pa.encode('NORRIS', 4), ('NRS', '')) - self.assertEqual(self.pa.encode('Nayle', 4), ('NL', '')) - self.assertEqual(self.pa.encode('Newcomb', 4), ('NKMP', '')) - self.assertEqual(self.pa.encode('Newcomb(e)', 4), ('NKMP', '')) - self.assertEqual(self.pa.encode('Newkirk', 4), ('NKRK', '')) - self.assertEqual(self.pa.encode('Newton', 4), ('NTN', '')) - self.assertEqual(self.pa.encode('Niles', 4), ('NLS', '')) - self.assertEqual(self.pa.encode('Noble', 4), ('NPL', '')) - self.assertEqual(self.pa.encode('Noel', 4), ('NL', '')) - self.assertEqual(self.pa.encode('Northend', 4), ('NR0N', 'NRTN')) - self.assertEqual(self.pa.encode('Norton', 4), ('NRTN', '')) - self.assertEqual(self.pa.encode('Nutter', 4), ('NTR', '')) - self.assertEqual(self.pa.encode('Odding', 4), ('ATNK', '')) - self.assertEqual(self.pa.encode('Odenbaugh', 4), ('ATNP', '')) - self.assertEqual(self.pa.encode('Ogborn', 4), ('AKPR', '')) - self.assertEqual(self.pa.encode('Oppenheimer', 4), ('APNM', '')) - self.assertEqual(self.pa.encode('Otis', 4), ('ATS', '')) - self.assertEqual(self.pa.encode('Oviatt', 4), ('AFT', '')) - self.assertEqual(self.pa.encode('PRUST?', 4), ('PRST', '')) - self.assertEqual(self.pa.encode('Paddock', 4), ('PTK', '')) - self.assertEqual(self.pa.encode('Page', 4), ('PJ', 'PK')) - self.assertEqual(self.pa.encode('Paine', 4), ('PN', '')) - self.assertEqual(self.pa.encode('Paist', 4), ('PST', '')) - self.assertEqual(self.pa.encode('Palmer', 4), ('PLMR', '')) - self.assertEqual(self.pa.encode('Park', 4), ('PRK', '')) - self.assertEqual(self.pa.encode('Parker', 4), ('PRKR', '')) - self.assertEqual(self.pa.encode('Parkhurst', 4), ('PRKR', '')) - self.assertEqual(self.pa.encode('Parrat', 4), ('PRT', '')) - self.assertEqual(self.pa.encode('Parsons', 4), ('PRSN', '')) - self.assertEqual(self.pa.encode('Partridge', 4), ('PRTR', '')) - self.assertEqual(self.pa.encode('Pashley', 4), ('PXL', '')) - self.assertEqual(self.pa.encode('Pasley', 4), ('PSL', '')) - self.assertEqual(self.pa.encode('Patrick', 4), ('PTRK', '')) - self.assertEqual(self.pa.encode('Pattee', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Patten', 4), ('PTN', '')) - self.assertEqual(self.pa.encode('Pawley', 4), ('PL', '')) - self.assertEqual(self.pa.encode('Payne', 4), ('PN', '')) - self.assertEqual(self.pa.encode('Peabody', 4), ('PPT', '')) - 
self.assertEqual(self.pa.encode('Peake', 4), ('PK', '')) - self.assertEqual(self.pa.encode('Pearson', 4), ('PRSN', '')) - self.assertEqual(self.pa.encode('Peat', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Pedersen', 4), ('PTRS', '')) - self.assertEqual(self.pa.encode('Percy', 4), ('PRS', '')) - self.assertEqual(self.pa.encode('Perkins', 4), ('PRKN', '')) - self.assertEqual(self.pa.encode('Perrine', 4), ('PRN', '')) - self.assertEqual(self.pa.encode('Perry', 4), ('PR', '')) - self.assertEqual(self.pa.encode('Peson', 4), ('PSN', '')) - self.assertEqual(self.pa.encode('Peterson', 4), ('PTRS', '')) - self.assertEqual(self.pa.encode('Peyton', 4), ('PTN', '')) - self.assertEqual(self.pa.encode('Phinney', 4), ('FN', '')) - self.assertEqual(self.pa.encode('Pickard', 4), ('PKRT', '')) - self.assertEqual(self.pa.encode('Pierce', 4), ('PRS', '')) - self.assertEqual(self.pa.encode('Pierrepont', 4), ('PRPN', '')) - self.assertEqual(self.pa.encode('Pike', 4), ('PK', '')) - self.assertEqual(self.pa.encode('Pinkham', 4), ('PNKM', '')) - self.assertEqual(self.pa.encode('Pitman', 4), ('PTMN', '')) - self.assertEqual(self.pa.encode('Pitt', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Pitts', 4), ('PTS', '')) - self.assertEqual(self.pa.encode('Plantagenet', 4), ('PLNT', '')) - self.assertEqual(self.pa.encode('Platt', 4), ('PLT', '')) - self.assertEqual(self.pa.encode('Platts', 4), ('PLTS', '')) - self.assertEqual(self.pa.encode('Pleis', 4), ('PLS', '')) - self.assertEqual(self.pa.encode('Pleiss', 4), ('PLS', '')) - self.assertEqual(self.pa.encode('Plisko', 4), ('PLSK', '')) - self.assertEqual(self.pa.encode('Pliskovitch', 4), ('PLSK', '')) - self.assertEqual(self.pa.encode('Plum', 4), ('PLM', '')) - self.assertEqual(self.pa.encode('Plume', 4), ('PLM', '')) - self.assertEqual(self.pa.encode('Poitou', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Pomeroy', 4), ('PMR', '')) - self.assertEqual(self.pa.encode('Poretiers', 4), ('PRTR', '')) - self.assertEqual(self.pa.encode('Pote', 4), ('PT', '')) - self.assertEqual(self.pa.encode('Potter', 4), ('PTR', '')) - self.assertEqual(self.pa.encode('Potts', 4), ('PTS', '')) - self.assertEqual(self.pa.encode('Powell', 4), ('PL', '')) - self.assertEqual(self.pa.encode('Pratt', 4), ('PRT', '')) - self.assertEqual(self.pa.encode('Presbury', 4), ('PRSP', '')) - self.assertEqual(self.pa.encode('Priest', 4), ('PRST', '')) - self.assertEqual(self.pa.encode('Prindle', 4), ('PRNT', '')) - self.assertEqual(self.pa.encode('Prior', 4), ('PRR', '')) - self.assertEqual(self.pa.encode('Profumo', 4), ('PRFM', '')) - self.assertEqual(self.pa.encode('Purdy', 4), ('PRT', '')) - self.assertEqual(self.pa.encode('Purefoy', 4), ('PRF', '')) - self.assertEqual(self.pa.encode('Pury', 4), ('PR', '')) - self.assertEqual(self.pa.encode('Quinter', 4), ('KNTR', '')) - self.assertEqual(self.pa.encode('Rachel', 4), ('RXL', 'RKL')) - self.assertEqual(self.pa.encode('Rand', 4), ('RNT', '')) - self.assertEqual(self.pa.encode('Rankin', 4), ('RNKN', '')) - self.assertEqual(self.pa.encode('Ravenscroft', 4), ('RFNS', '')) - self.assertEqual(self.pa.encode('Raynsford', 4), ('RNSF', '')) - self.assertEqual(self.pa.encode('Reakirt', 4), ('RKRT', '')) - self.assertEqual(self.pa.encode('Reaves', 4), ('RFS', '')) - self.assertEqual(self.pa.encode('Reeves', 4), ('RFS', '')) - self.assertEqual(self.pa.encode('Reichert', 4), ('RXRT', 'RKRT')) - self.assertEqual(self.pa.encode('Remmele', 4), ('RML', '')) - self.assertEqual(self.pa.encode('Reynolds', 4), ('RNLT', '')) - 
self.assertEqual(self.pa.encode('Rhodes', 4), ('RTS', '')) - self.assertEqual(self.pa.encode('Richards', 4), ('RXRT', 'RKRT')) - self.assertEqual(self.pa.encode('Richardson', 4), ('RXRT', 'RKRT')) - self.assertEqual(self.pa.encode('Ring', 4), ('RNK', '')) - self.assertEqual(self.pa.encode('Roberts', 4), ('RPRT', '')) - self.assertEqual(self.pa.encode('Robertson', 4), ('RPRT', '')) - self.assertEqual(self.pa.encode('Robson', 4), ('RPSN', '')) - self.assertEqual(self.pa.encode('Rodie', 4), ('RT', '')) - self.assertEqual(self.pa.encode('Rody', 4), ('RT', '')) - self.assertEqual(self.pa.encode('Rogers', 4), ('RKRS', 'RJRS')) - self.assertEqual(self.pa.encode('Ross', 4), ('RS', '')) - self.assertEqual(self.pa.encode('Rosslevin', 4), ('RSLF', '')) - self.assertEqual(self.pa.encode('Rowland', 4), ('RLNT', '')) - self.assertEqual(self.pa.encode('Ruehl', 4), ('RL', '')) - self.assertEqual(self.pa.encode('Russell', 4), ('RSL', '')) - self.assertEqual(self.pa.encode('Ruth', 4), ('R0', 'RT')) - self.assertEqual(self.pa.encode('Ryan', 4), ('RN', '')) - self.assertEqual(self.pa.encode('Rysse', 4), ('RS', '')) - self.assertEqual(self.pa.encode('Sadler', 4), ('STLR', '')) - self.assertEqual(self.pa.encode('Salmon', 4), ('SLMN', '')) - self.assertEqual(self.pa.encode('Salter', 4), ('SLTR', '')) - self.assertEqual(self.pa.encode('Salvatore', 4), ('SLFT', '')) - self.assertEqual(self.pa.encode('Sanders', 4), ('SNTR', '')) - self.assertEqual(self.pa.encode('Sands', 4), ('SNTS', '')) - self.assertEqual(self.pa.encode('Sanford', 4), ('SNFR', '')) - self.assertEqual(self.pa.encode('Sanger', 4), ('SNKR', 'SNJR')) - self.assertEqual(self.pa.encode('Sargent', 4), ('SRJN', 'SRKN')) - self.assertEqual(self.pa.encode('Saunders', 4), ('SNTR', '')) - self.assertEqual(self.pa.encode('Schilling', 4), ('XLNK', '')) - self.assertEqual(self.pa.encode('Schlegel', 4), ('XLKL', 'SLKL')) - self.assertEqual(self.pa.encode('Scott', 4), ('SKT', '')) - self.assertEqual(self.pa.encode('Sears', 4), ('SRS', '')) - self.assertEqual(self.pa.encode('Segersall', 4), ('SJRS', 'SKRS')) - self.assertEqual(self.pa.encode('Senecal', 4), ('SNKL', '')) - self.assertEqual(self.pa.encode('Sergeaux', 4), ('SRJ', 'SRK')) - self.assertEqual(self.pa.encode('Severance', 4), ('SFRN', '')) - self.assertEqual(self.pa.encode('Sharp', 4), ('XRP', '')) - self.assertEqual(self.pa.encode('Sharpe', 4), ('XRP', '')) - self.assertEqual(self.pa.encode('Sharply', 4), ('XRPL', '')) - self.assertEqual(self.pa.encode('Shatswell', 4), ('XTSL', '')) - self.assertEqual(self.pa.encode('Shattack', 4), ('XTK', '')) - self.assertEqual(self.pa.encode('Shattock', 4), ('XTK', '')) - self.assertEqual(self.pa.encode('Shattuck', 4), ('XTK', '')) - self.assertEqual(self.pa.encode('Shaw', 4), ('X', 'XF')) - self.assertEqual(self.pa.encode('Sheldon', 4), ('XLTN', '')) - self.assertEqual(self.pa.encode('Sherman', 4), ('XRMN', '')) - self.assertEqual(self.pa.encode('Shinn', 4), ('XN', '')) - self.assertEqual(self.pa.encode('Shirford', 4), ('XRFR', '')) - self.assertEqual(self.pa.encode('Shirley', 4), ('XRL', '')) - self.assertEqual(self.pa.encode('Shively', 4), ('XFL', '')) - self.assertEqual(self.pa.encode('Shoemaker', 4), ('XMKR', '')) - self.assertEqual(self.pa.encode('Short', 4), ('XRT', '')) - self.assertEqual(self.pa.encode('Shotwell', 4), ('XTL', '')) - self.assertEqual(self.pa.encode('Shute', 4), ('XT', '')) - self.assertEqual(self.pa.encode('Sibley', 4), ('SPL', '')) - self.assertEqual(self.pa.encode('Silver', 4), ('SLFR', '')) - self.assertEqual(self.pa.encode('Simes', 4), 
('SMS', '')) - self.assertEqual(self.pa.encode('Sinken', 4), ('SNKN', '')) - self.assertEqual(self.pa.encode('Sinn', 4), ('SN', '')) - self.assertEqual(self.pa.encode('Skelton', 4), ('SKLT', '')) - self.assertEqual(self.pa.encode('Skiffe', 4), ('SKF', '')) - self.assertEqual(self.pa.encode('Skotkonung', 4), ('SKTK', '')) - self.assertEqual(self.pa.encode('Slade', 4), ('SLT', 'XLT')) - self.assertEqual(self.pa.encode('Slye', 4), ('SL', 'XL')) - self.assertEqual(self.pa.encode('Smedley', 4), ('SMTL', 'XMTL')) - self.assertEqual(self.pa.encode('Smith', 4), ('SM0', 'XMT')) - self.assertEqual(self.pa.encode('Snow', 4), ('SN', 'XNF')) - self.assertEqual(self.pa.encode('Soole', 4), ('SL', '')) - self.assertEqual(self.pa.encode('Soule', 4), ('SL', '')) - self.assertEqual(self.pa.encode('Southworth', 4), ('S0R0', 'STRT')) - self.assertEqual(self.pa.encode('Sowles', 4), ('SLS', '')) - self.assertEqual(self.pa.encode('Spalding', 4), ('SPLT', '')) - self.assertEqual(self.pa.encode('Spark', 4), ('SPRK', '')) - self.assertEqual(self.pa.encode('Spencer', 4), ('SPNS', '')) - self.assertEqual(self.pa.encode('Sperry', 4), ('SPR', '')) - self.assertEqual(self.pa.encode('Spofford', 4), ('SPFR', '')) - self.assertEqual(self.pa.encode('Spooner', 4), ('SPNR', '')) - self.assertEqual(self.pa.encode('Sprague', 4), ('SPRK', '')) - self.assertEqual(self.pa.encode('Springer', 4), ('SPRN', '')) - self.assertEqual(self.pa.encode('St. Clair', 4), ('STKL', '')) - self.assertEqual(self.pa.encode('St. Claire', 4), ('STKL', '')) - self.assertEqual(self.pa.encode('St. Leger', 4), ('STLJ', 'STLK')) - self.assertEqual(self.pa.encode('St. Omer', 4), ('STMR', '')) - self.assertEqual(self.pa.encode('Stafferton', 4), ('STFR', '')) - self.assertEqual(self.pa.encode('Stafford', 4), ('STFR', '')) - self.assertEqual(self.pa.encode('Stalham', 4), ('STLM', '')) - self.assertEqual(self.pa.encode('Stanford', 4), ('STNF', '')) - self.assertEqual(self.pa.encode('Stanton', 4), ('STNT', '')) - self.assertEqual(self.pa.encode('Star', 4), ('STR', '')) - self.assertEqual(self.pa.encode('Starbuck', 4), ('STRP', '')) - self.assertEqual(self.pa.encode('Starkey', 4), ('STRK', '')) - self.assertEqual(self.pa.encode('Starkweather', 4), ('STRK', '')) - self.assertEqual(self.pa.encode('Stearns', 4), ('STRN', '')) - self.assertEqual(self.pa.encode('Stebbins', 4), ('STPN', '')) - self.assertEqual(self.pa.encode('Steele', 4), ('STL', '')) - self.assertEqual(self.pa.encode('Stephenson', 4), ('STFN', '')) - self.assertEqual(self.pa.encode('Stevens', 4), ('STFN', '')) - self.assertEqual(self.pa.encode('Stoddard', 4), ('STTR', '')) - self.assertEqual(self.pa.encode('Stodder', 4), ('STTR', '')) - self.assertEqual(self.pa.encode('Stone', 4), ('STN', '')) - self.assertEqual(self.pa.encode('Storey', 4), ('STR', '')) - self.assertEqual(self.pa.encode('Storrada', 4), ('STRT', '')) - self.assertEqual(self.pa.encode('Story', 4), ('STR', '')) - self.assertEqual(self.pa.encode('Stoughton', 4), ('STFT', '')) - self.assertEqual(self.pa.encode('Stout', 4), ('STT', '')) - self.assertEqual(self.pa.encode('Stow', 4), ('ST', 'STF')) - self.assertEqual(self.pa.encode('Strong', 4), ('STRN', '')) - self.assertEqual(self.pa.encode('Strutt', 4), ('STRT', '')) - self.assertEqual(self.pa.encode('Stryker', 4), ('STRK', '')) - self.assertEqual(self.pa.encode('Stuckeley', 4), ('STKL', '')) - self.assertEqual(self.pa.encode('Sturges', 4), ('STRJ', 'STRK')) - self.assertEqual(self.pa.encode('Sturgess', 4), ('STRJ', 'STRK')) - self.assertEqual(self.pa.encode('Sturgis', 4), ('STRJ', 
'STRK')) - self.assertEqual(self.pa.encode('Suevain', 4), ('SFN', '')) - self.assertEqual(self.pa.encode('Sulyard', 4), ('SLRT', '')) - self.assertEqual(self.pa.encode('Sutton', 4), ('STN', '')) - self.assertEqual(self.pa.encode('Swain', 4), ('SN', 'XN')) - self.assertEqual(self.pa.encode('Swayne', 4), ('SN', 'XN')) - self.assertEqual(self.pa.encode('Swayze', 4), ('SS', 'XTS')) - self.assertEqual(self.pa.encode('Swift', 4), ('SFT', 'XFT')) - self.assertEqual(self.pa.encode('Taber', 4), ('TPR', '')) - self.assertEqual(self.pa.encode('Talcott', 4), ('TLKT', '')) - self.assertEqual(self.pa.encode('Tarne', 4), ('TRN', '')) - self.assertEqual(self.pa.encode('Tatum', 4), ('TTM', '')) - self.assertEqual(self.pa.encode('Taverner', 4), ('TFRN', '')) - self.assertEqual(self.pa.encode('Taylor', 4), ('TLR', '')) - self.assertEqual(self.pa.encode('Tenney', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Thayer', 4), ('0R', 'TR')) - self.assertEqual(self.pa.encode('Thember', 4), ('0MPR', 'TMPR')) - self.assertEqual(self.pa.encode('Thomas', 4), ('TMS', '')) - self.assertEqual(self.pa.encode('Thompson', 4), ('TMPS', '')) - self.assertEqual(self.pa.encode('Thorne', 4), ('0RN', 'TRN')) - self.assertEqual(self.pa.encode('Thornycraft', 4), ('0RNK', 'TRNK')) - self.assertEqual(self.pa.encode('Threlkeld', 4), ('0RLK', 'TRLK')) - self.assertEqual(self.pa.encode('Throckmorton', 4), ('0RKM', 'TRKM')) - self.assertEqual(self.pa.encode('Thwaits', 4), ('0TS', 'TTS')) - self.assertEqual(self.pa.encode('Tibbetts', 4), ('TPTS', '')) - self.assertEqual(self.pa.encode('Tidd', 4), ('TT', '')) - self.assertEqual(self.pa.encode('Tierney', 4), ('TRN', '')) - self.assertEqual(self.pa.encode('Tilley', 4), ('TL', '')) - self.assertEqual(self.pa.encode('Tillieres', 4), ('TLRS', '')) - self.assertEqual(self.pa.encode('Tilly', 4), ('TL', '')) - self.assertEqual(self.pa.encode('Tisdale', 4), ('TSTL', '')) - self.assertEqual(self.pa.encode('Titus', 4), ('TTS', '')) - self.assertEqual(self.pa.encode('Tobey', 4), ('TP', '')) - self.assertEqual(self.pa.encode('Tooker', 4), ('TKR', '')) - self.assertEqual(self.pa.encode('Towle', 4), ('TL', '')) - self.assertEqual(self.pa.encode('Towne', 4), ('TN', '')) - self.assertEqual(self.pa.encode('Townsend', 4), ('TNSN', '')) - self.assertEqual(self.pa.encode('Treadway', 4), ('TRT', '')) - self.assertEqual(self.pa.encode('Trelawney', 4), ('TRLN', '')) - self.assertEqual(self.pa.encode('Trinder', 4), ('TRNT', '')) - self.assertEqual(self.pa.encode('Tripp', 4), ('TRP', '')) - self.assertEqual(self.pa.encode('Trippe', 4), ('TRP', '')) - self.assertEqual(self.pa.encode('Trott', 4), ('TRT', '')) - self.assertEqual(self.pa.encode('True', 4), ('TR', '')) - self.assertEqual(self.pa.encode('Trussebut', 4), ('TRSP', '')) - self.assertEqual(self.pa.encode('Tucker', 4), ('TKR', '')) - self.assertEqual(self.pa.encode('Turgeon', 4), ('TRJN', 'TRKN')) - self.assertEqual(self.pa.encode('Turner', 4), ('TRNR', '')) - self.assertEqual(self.pa.encode('Tuttle', 4), ('TTL', '')) - self.assertEqual(self.pa.encode('Tyler', 4), ('TLR', '')) - self.assertEqual(self.pa.encode('Tylle', 4), ('TL', '')) - self.assertEqual(self.pa.encode('Tyrrel', 4), ('TRL', '')) - self.assertEqual(self.pa.encode('Ua Tuathail', 4), ('AT0L', 'ATTL')) - self.assertEqual(self.pa.encode('Ulrich', 4), ('ALRX', 'ALRK')) - self.assertEqual(self.pa.encode('Underhill', 4), ('ANTR', '')) - self.assertEqual(self.pa.encode('Underwood', 4), ('ANTR', '')) - self.assertEqual(self.pa.encode('Unknown', 4), ('ANKN', '')) - 
self.assertEqual(self.pa.encode('Valentine', 4), ('FLNT', '')) - self.assertEqual(self.pa.encode('Van Egmond', 4), ('FNKM', '')) - self.assertEqual(self.pa.encode('Van der Beek', 4), ('FNTR', '')) - self.assertEqual(self.pa.encode('Vaughan', 4), ('FKN', '')) - self.assertEqual(self.pa.encode('Vermenlen', 4), ('FRMN', '')) - self.assertEqual(self.pa.encode('Vincent', 4), ('FNSN', '')) - self.assertEqual(self.pa.encode('Volentine', 4), ('FLNT', '')) - self.assertEqual(self.pa.encode('Wagner', 4), ('AKNR', 'FKNR')) - self.assertEqual(self.pa.encode('Waite', 4), ('AT', 'FT')) - self.assertEqual(self.pa.encode('Walker', 4), ('ALKR', 'FLKR')) - self.assertEqual(self.pa.encode('Walter', 4), ('ALTR', 'FLTR')) - self.assertEqual(self.pa.encode('Wandell', 4), ('ANTL', 'FNTL')) - self.assertEqual(self.pa.encode('Wandesford', 4), ('ANTS', 'FNTS')) - self.assertEqual(self.pa.encode('Warbleton', 4), ('ARPL', 'FRPL')) - self.assertEqual(self.pa.encode('Ward', 4), ('ART', 'FRT')) - self.assertEqual(self.pa.encode('Warde', 4), ('ART', 'FRT')) - self.assertEqual(self.pa.encode('Ware', 4), ('AR', 'FR')) - self.assertEqual(self.pa.encode('Wareham', 4), ('ARHM', 'FRHM')) - self.assertEqual(self.pa.encode('Warner', 4), ('ARNR', 'FRNR')) - self.assertEqual(self.pa.encode('Warren', 4), ('ARN', 'FRN')) - self.assertEqual(self.pa.encode('Washburne', 4), ('AXPR', 'FXPR')) - self.assertEqual(self.pa.encode('Waterbury', 4), ('ATRP', 'FTRP')) - self.assertEqual(self.pa.encode('Watson', 4), ('ATSN', 'FTSN')) - self.assertEqual( - self.pa.encode('WatsonEllithorpe', 4), ('ATSN', 'FTSN') - ) - self.assertEqual(self.pa.encode('Watts', 4), ('ATS', 'FTS')) - self.assertEqual(self.pa.encode('Wayne', 4), ('AN', 'FN')) - self.assertEqual(self.pa.encode('Webb', 4), ('AP', 'FP')) - self.assertEqual(self.pa.encode('Weber', 4), ('APR', 'FPR')) - self.assertEqual(self.pa.encode('Webster', 4), ('APST', 'FPST')) - self.assertEqual(self.pa.encode('Weed', 4), ('AT', 'FT')) - self.assertEqual(self.pa.encode('Weeks', 4), ('AKS', 'FKS')) - self.assertEqual(self.pa.encode('Wells', 4), ('ALS', 'FLS')) - self.assertEqual(self.pa.encode('Wenzell', 4), ('ANSL', 'FNTS')) - self.assertEqual(self.pa.encode('West', 4), ('AST', 'FST')) - self.assertEqual(self.pa.encode('Westbury', 4), ('ASTP', 'FSTP')) - self.assertEqual(self.pa.encode('Whatlocke', 4), ('ATLK', '')) - self.assertEqual(self.pa.encode('Wheeler', 4), ('ALR', '')) - self.assertEqual(self.pa.encode('Whiston', 4), ('ASTN', '')) - self.assertEqual(self.pa.encode('White', 4), ('AT', '')) - self.assertEqual(self.pa.encode('Whitman', 4), ('ATMN', '')) - self.assertEqual(self.pa.encode('Whiton', 4), ('ATN', '')) - self.assertEqual(self.pa.encode('Whitson', 4), ('ATSN', '')) - self.assertEqual(self.pa.encode('Wickes', 4), ('AKS', 'FKS')) - self.assertEqual(self.pa.encode('Wilbur', 4), ('ALPR', 'FLPR')) - self.assertEqual(self.pa.encode('Wilcotes', 4), ('ALKT', 'FLKT')) - self.assertEqual(self.pa.encode('Wilkinson', 4), ('ALKN', 'FLKN')) - self.assertEqual(self.pa.encode('Willets', 4), ('ALTS', 'FLTS')) - self.assertEqual(self.pa.encode('Willett', 4), ('ALT', 'FLT')) - self.assertEqual(self.pa.encode('Willey', 4), ('AL', 'FL')) - self.assertEqual(self.pa.encode('Williams', 4), ('ALMS', 'FLMS')) - self.assertEqual(self.pa.encode('Williston', 4), ('ALST', 'FLST')) - self.assertEqual(self.pa.encode('Wilson', 4), ('ALSN', 'FLSN')) - self.assertEqual(self.pa.encode('Wimes', 4), ('AMS', 'FMS')) - self.assertEqual(self.pa.encode('Winch', 4), ('ANX', 'FNK')) - self.assertEqual(self.pa.encode('Winegar', 
4), ('ANKR', 'FNKR')) - self.assertEqual(self.pa.encode('Wing', 4), ('ANK', 'FNK')) - self.assertEqual(self.pa.encode('Winsley', 4), ('ANSL', 'FNSL')) - self.assertEqual(self.pa.encode('Winslow', 4), ('ANSL', 'FNSL')) - self.assertEqual(self.pa.encode('Winthrop', 4), ('AN0R', 'FNTR')) - self.assertEqual(self.pa.encode('Wise', 4), ('AS', 'FS')) - self.assertEqual(self.pa.encode('Wood', 4), ('AT', 'FT')) - self.assertEqual(self.pa.encode('Woodbridge', 4), ('ATPR', 'FTPR')) - self.assertEqual(self.pa.encode('Woodward', 4), ('ATRT', 'FTRT')) - self.assertEqual(self.pa.encode('Wooley', 4), ('AL', 'FL')) - self.assertEqual(self.pa.encode('Woolley', 4), ('AL', 'FL')) - self.assertEqual(self.pa.encode('Worth', 4), ('AR0', 'FRT')) - self.assertEqual(self.pa.encode('Worthen', 4), ('AR0N', 'FRTN')) - self.assertEqual(self.pa.encode('Worthley', 4), ('AR0L', 'FRTL')) - self.assertEqual(self.pa.encode('Wright', 4), ('RT', '')) - self.assertEqual(self.pa.encode('Wyer', 4), ('AR', 'FR')) - self.assertEqual(self.pa.encode('Wyere', 4), ('AR', 'FR')) - self.assertEqual(self.pa.encode('Wynkoop', 4), ('ANKP', 'FNKP')) - self.assertEqual(self.pa.encode('Yarnall', 4), ('ARNL', '')) - self.assertEqual(self.pa.encode('Yeoman', 4), ('AMN', '')) - self.assertEqual(self.pa.encode('Yorke', 4), ('ARK', '')) - self.assertEqual(self.pa.encode('Young', 4), ('ANK', '')) - self.assertEqual(self.pa.encode('ab Wennonwen', 4), ('APNN', '')) - self.assertEqual(self.pa.encode('ap Llewellyn', 4), ('APLL', '')) - self.assertEqual(self.pa.encode('ap Lorwerth', 4), ('APLR', '')) - self.assertEqual(self.pa.encode('d\'Angouleme', 4), ('TNKL', '')) - self.assertEqual(self.pa.encode('de Audeham', 4), ('TTHM', '')) - self.assertEqual(self.pa.encode('de Bavant', 4), ('TPFN', '')) - self.assertEqual(self.pa.encode('de Beauchamp', 4), ('TPXM', 'TPKM')) - self.assertEqual(self.pa.encode('de Beaumont', 4), ('TPMN', '')) - self.assertEqual(self.pa.encode('de Bolbec', 4), ('TPLP', '')) - self.assertEqual(self.pa.encode('de Braiose', 4), ('TPRS', '')) - self.assertEqual(self.pa.encode('de Braose', 4), ('TPRS', '')) - self.assertEqual(self.pa.encode('de Briwere', 4), ('TPRR', '')) - self.assertEqual(self.pa.encode('de Cantelou', 4), ('TKNT', '')) - self.assertEqual(self.pa.encode('de Cherelton', 4), ('TXRL', 'TKRL')) - self.assertEqual(self.pa.encode('de Cherleton', 4), ('TXRL', 'TKRL')) - self.assertEqual(self.pa.encode('de Clare', 4), ('TKLR', '')) - self.assertEqual(self.pa.encode('de Claremont', 4), ('TKLR', '')) - self.assertEqual(self.pa.encode('de Clifford', 4), ('TKLF', '')) - self.assertEqual(self.pa.encode('de Colville', 4), ('TKLF', '')) - self.assertEqual(self.pa.encode('de Courtenay', 4), ('TKRT', '')) - self.assertEqual(self.pa.encode('de Fauconberg', 4), ('TFKN', '')) - self.assertEqual(self.pa.encode('de Forest', 4), ('TFRS', '')) - self.assertEqual(self.pa.encode('de Gai', 4), ('TK', '')) - self.assertEqual(self.pa.encode('de Grey', 4), ('TKR', '')) - self.assertEqual(self.pa.encode('de Guernons', 4), ('TKRN', '')) - self.assertEqual(self.pa.encode('de Haia', 4), ('T', '')) - self.assertEqual(self.pa.encode('de Harcourt', 4), ('TRKR', '')) - self.assertEqual(self.pa.encode('de Hastings', 4), ('TSTN', '')) - self.assertEqual(self.pa.encode('de Hoke', 4), ('TK', '')) - self.assertEqual(self.pa.encode('de Hooch', 4), ('TK', '')) - self.assertEqual(self.pa.encode('de Hugelville', 4), ('TJLF', 'TKLF')) - self.assertEqual(self.pa.encode('de Huntingdon', 4), ('TNTN', '')) - self.assertEqual(self.pa.encode('de Insula', 4), ('TNSL', '')) 
- self.assertEqual(self.pa.encode('de Keynes', 4), ('TKNS', '')) - self.assertEqual(self.pa.encode('de Lacy', 4), ('TLS', '')) - self.assertEqual(self.pa.encode('de Lexington', 4), ('TLKS', '')) - self.assertEqual(self.pa.encode('de Lusignan', 4), ('TLSN', 'TLSK')) - self.assertEqual(self.pa.encode('de Manvers', 4), ('TMNF', '')) - self.assertEqual(self.pa.encode('de Montagu', 4), ('TMNT', '')) - self.assertEqual(self.pa.encode('de Montault', 4), ('TMNT', '')) - self.assertEqual(self.pa.encode('de Montfort', 4), ('TMNT', '')) - self.assertEqual(self.pa.encode('de Mortimer', 4), ('TMRT', '')) - self.assertEqual(self.pa.encode('de Morville', 4), ('TMRF', '')) - self.assertEqual(self.pa.encode('de Morvois', 4), ('TMRF', '')) - self.assertEqual(self.pa.encode('de Neufmarche', 4), ('TNFM', '')) - self.assertEqual(self.pa.encode('de Odingsells', 4), ('TTNK', '')) - self.assertEqual(self.pa.encode('de Odyngsells', 4), ('TTNK', '')) - self.assertEqual(self.pa.encode('de Percy', 4), ('TPRS', '')) - self.assertEqual(self.pa.encode('de Pierrepont', 4), ('TPRP', '')) - self.assertEqual(self.pa.encode('de Plessetis', 4), ('TPLS', '')) - self.assertEqual(self.pa.encode('de Porhoet', 4), ('TPRT', '')) - self.assertEqual(self.pa.encode('de Prouz', 4), ('TPRS', '')) - self.assertEqual(self.pa.encode('de Quincy', 4), ('TKNS', '')) - self.assertEqual(self.pa.encode('de Ripellis', 4), ('TRPL', '')) - self.assertEqual(self.pa.encode('de Ros', 4), ('TRS', '')) - self.assertEqual(self.pa.encode('de Salisbury', 4), ('TSLS', '')) - self.assertEqual(self.pa.encode('de Sanford', 4), ('TSNF', '')) - self.assertEqual(self.pa.encode('de Somery', 4), ('TSMR', '')) - self.assertEqual(self.pa.encode('de St. Hilary', 4), ('TSTL', '')) - self.assertEqual(self.pa.encode('de St. Liz', 4), ('TSTL', '')) - self.assertEqual(self.pa.encode('de Sutton', 4), ('TSTN', '')) - self.assertEqual(self.pa.encode('de Toeni', 4), ('TTN', '')) - self.assertEqual(self.pa.encode('de Tony', 4), ('TTN', '')) - self.assertEqual(self.pa.encode('de Umfreville', 4), ('TMFR', '')) - self.assertEqual(self.pa.encode('de Valognes', 4), ('TFLN', 'TFLK')) - self.assertEqual(self.pa.encode('de Vaux', 4), ('TF', '')) - self.assertEqual(self.pa.encode('de Vere', 4), ('TFR', '')) - self.assertEqual(self.pa.encode('de Vermandois', 4), ('TFRM', '')) - self.assertEqual(self.pa.encode('de Vernon', 4), ('TFRN', '')) - self.assertEqual(self.pa.encode('de Vexin', 4), ('TFKS', '')) - self.assertEqual(self.pa.encode('de Vitre', 4), ('TFTR', '')) - self.assertEqual(self.pa.encode('de Wandesford', 4), ('TNTS', '')) - self.assertEqual(self.pa.encode('de Warenne', 4), ('TRN', '')) - self.assertEqual(self.pa.encode('de Westbury', 4), ('TSTP', '')) - self.assertEqual(self.pa.encode('di Saluzzo', 4), ('TSLS', 'TSLT')) - self.assertEqual(self.pa.encode('fitz Alan', 4), ('FTSL', '')) - self.assertEqual(self.pa.encode('fitz Geoffrey', 4), ('FTSJ', 'FTSK')) - self.assertEqual(self.pa.encode('fitz Herbert', 4), ('FTSR', '')) - self.assertEqual(self.pa.encode('fitz John', 4), ('FTSJ', '')) - self.assertEqual(self.pa.encode('fitz Patrick', 4), ('FTSP', '')) - self.assertEqual(self.pa.encode('fitz Payn', 4), ('FTSP', '')) - self.assertEqual(self.pa.encode('fitz Piers', 4), ('FTSP', '')) - self.assertEqual(self.pa.encode('fitz Randolph', 4), ('FTSR', '')) - self.assertEqual(self.pa.encode('fitz Richard', 4), ('FTSR', '')) - self.assertEqual(self.pa.encode('fitz Robert', 4), ('FTSR', '')) - self.assertEqual(self.pa.encode('fitz Roy', 4), ('FTSR', '')) - 
self.assertEqual(self.pa.encode('fitz Scrob', 4), ('FTSS', '')) - self.assertEqual(self.pa.encode('fitz Walter', 4), ('FTSL', '')) - self.assertEqual(self.pa.encode('fitz Warin', 4), ('FTSR', '')) - self.assertEqual(self.pa.encode('fitz Williams', 4), ('FTSL', '')) - self.assertEqual(self.pa.encode('la Zouche', 4), ('LSX', 'LSK')) - self.assertEqual(self.pa.encode('le Botiller', 4), ('LPTL', '')) - self.assertEqual(self.pa.encode('le Despenser', 4), ('LTSP', '')) - self.assertEqual(self.pa.encode('le deSpencer', 4), ('LTSP', '')) - self.assertEqual(self.pa.encode('of Allendale', 4), ('AFLN', '')) - self.assertEqual(self.pa.encode('of Angouleme', 4), ('AFNK', '')) - self.assertEqual(self.pa.encode('of Anjou', 4), ('AFNJ', '')) - self.assertEqual(self.pa.encode('of Aquitaine', 4), ('AFKT', '')) - self.assertEqual(self.pa.encode('of Aumale', 4), ('AFML', '')) - self.assertEqual(self.pa.encode('of Bavaria', 4), ('AFPF', '')) - self.assertEqual(self.pa.encode('of Boulogne', 4), ('AFPL', '')) - self.assertEqual(self.pa.encode('of Brittany', 4), ('AFPR', '')) - self.assertEqual(self.pa.encode('of Brittary', 4), ('AFPR', '')) - self.assertEqual(self.pa.encode('of Castile', 4), ('AFKS', '')) - self.assertEqual(self.pa.encode('of Chester', 4), ('AFXS', 'AFKS')) - self.assertEqual(self.pa.encode('of Clermont', 4), ('AFKL', '')) - self.assertEqual(self.pa.encode('of Cologne', 4), ('AFKL', '')) - self.assertEqual(self.pa.encode('of Dinan', 4), ('AFTN', '')) - self.assertEqual(self.pa.encode('of Dunbar', 4), ('AFTN', '')) - self.assertEqual(self.pa.encode('of England', 4), ('AFNK', '')) - self.assertEqual(self.pa.encode('of Essex', 4), ('AFSK', '')) - self.assertEqual(self.pa.encode('of Falaise', 4), ('AFFL', '')) - self.assertEqual(self.pa.encode('of Flanders', 4), ('AFFL', '')) - self.assertEqual(self.pa.encode('of Galloway', 4), ('AFKL', '')) - self.assertEqual(self.pa.encode('of Germany', 4), ('AFKR', 'AFJR')) - self.assertEqual(self.pa.encode('of Gloucester', 4), ('AFKL', '')) - self.assertEqual(self.pa.encode('of Heristal', 4), ('AFRS', '')) - self.assertEqual(self.pa.encode('of Hungary', 4), ('AFNK', '')) - self.assertEqual(self.pa.encode('of Huntington', 4), ('AFNT', '')) - self.assertEqual(self.pa.encode('of Kiev', 4), ('AFKF', '')) - self.assertEqual(self.pa.encode('of Kuno', 4), ('AFKN', '')) - self.assertEqual(self.pa.encode('of Landen', 4), ('AFLN', '')) - self.assertEqual(self.pa.encode('of Laon', 4), ('AFLN', '')) - self.assertEqual(self.pa.encode('of Leinster', 4), ('AFLN', '')) - self.assertEqual(self.pa.encode('of Lens', 4), ('AFLN', '')) - self.assertEqual(self.pa.encode('of Lorraine', 4), ('AFLR', '')) - self.assertEqual(self.pa.encode('of Louvain', 4), ('AFLF', '')) - self.assertEqual(self.pa.encode('of Mercia', 4), ('AFMR', '')) - self.assertEqual(self.pa.encode('of Metz', 4), ('AFMT', '')) - self.assertEqual(self.pa.encode('of Meulan', 4), ('AFML', '')) - self.assertEqual(self.pa.encode('of Nass', 4), ('AFNS', '')) - self.assertEqual(self.pa.encode('of Normandy', 4), ('AFNR', '')) - self.assertEqual(self.pa.encode('of Ohningen', 4), ('AFNN', '')) - self.assertEqual(self.pa.encode('of Orleans', 4), ('AFRL', '')) - self.assertEqual(self.pa.encode('of Poitou', 4), ('AFPT', '')) - self.assertEqual(self.pa.encode('of Polotzk', 4), ('AFPL', '')) - self.assertEqual(self.pa.encode('of Provence', 4), ('AFPR', '')) - self.assertEqual(self.pa.encode('of Ringelheim', 4), ('AFRN', '')) - self.assertEqual(self.pa.encode('of Salisbury', 4), ('AFSL', '')) - self.assertEqual(self.pa.encode('of 
Saxony', 4), ('AFSK', '')) - self.assertEqual(self.pa.encode('of Scotland', 4), ('AFSK', '')) - self.assertEqual(self.pa.encode('of Senlis', 4), ('AFSN', '')) - self.assertEqual(self.pa.encode('of Stafford', 4), ('AFST', '')) - self.assertEqual(self.pa.encode('of Swabia', 4), ('AFSP', '')) - self.assertEqual(self.pa.encode('of Tongres', 4), ('AFTN', '')) + self.assertEqual(self.pa_4.encode(''), ('', '')) + self.assertEqual(self.pa_4.encode('ALLERTON'), ('ALRT', '')) + self.assertEqual(self.pa_4.encode('Acton'), ('AKTN', '')) + self.assertEqual(self.pa_4.encode('Adams'), ('ATMS', '')) + self.assertEqual(self.pa_4.encode('Aggar'), ('AKR', '')) + self.assertEqual(self.pa_4.encode('Ahl'), ('AL', '')) + self.assertEqual(self.pa_4.encode('Aiken'), ('AKN', '')) + self.assertEqual(self.pa_4.encode('Alan'), ('ALN', '')) + self.assertEqual(self.pa_4.encode('Alcock'), ('ALKK', '')) + self.assertEqual(self.pa_4.encode('Alden'), ('ALTN', '')) + self.assertEqual(self.pa_4.encode('Aldham'), ('ALTM', '')) + self.assertEqual(self.pa_4.encode('Allen'), ('ALN', '')) + self.assertEqual(self.pa_4.encode('Allerton'), ('ALRT', '')) + self.assertEqual(self.pa_4.encode('Alsop'), ('ALSP', '')) + self.assertEqual(self.pa_4.encode('Alwein'), ('ALN', '')) + self.assertEqual(self.pa_4.encode('Ambler'), ('AMPL', '')) + self.assertEqual(self.pa_4.encode('Andevill'), ('ANTF', '')) + self.assertEqual(self.pa_4.encode('Andrews'), ('ANTR', '')) + self.assertEqual(self.pa_4.encode('Andreyco'), ('ANTR', '')) + self.assertEqual(self.pa_4.encode('Andriesse'), ('ANTR', '')) + self.assertEqual(self.pa_4.encode('Angier'), ('ANJ', 'ANJR')) + self.assertEqual(self.pa_4.encode('Annabel'), ('ANPL', '')) + self.assertEqual(self.pa_4.encode('Anne'), ('AN', '')) + self.assertEqual(self.pa_4.encode('Anstye'), ('ANST', '')) + self.assertEqual(self.pa_4.encode('Appling'), ('APLN', '')) + self.assertEqual(self.pa_4.encode('Apuke'), ('APK', '')) + self.assertEqual(self.pa_4.encode('Arnold'), ('ARNL', '')) + self.assertEqual(self.pa_4.encode('Ashby'), ('AXP', '')) + self.assertEqual(self.pa_4.encode('Astwood'), ('ASTT', '')) + self.assertEqual(self.pa_4.encode('Atkinson'), ('ATKN', '')) + self.assertEqual(self.pa_4.encode('Audley'), ('ATL', '')) + self.assertEqual(self.pa_4.encode('Austin'), ('ASTN', '')) + self.assertEqual(self.pa_4.encode('Avenal'), ('AFNL', '')) + self.assertEqual(self.pa_4.encode('Ayer'), ('AR', '')) + self.assertEqual(self.pa_4.encode('Ayot'), ('AT', '')) + self.assertEqual(self.pa_4.encode('Babbitt'), ('PPT', '')) + self.assertEqual(self.pa_4.encode('Bachelor'), ('PXLR', 'PKLR')) + self.assertEqual(self.pa_4.encode('Bachelour'), ('PXLR', 'PKLR')) + self.assertEqual(self.pa_4.encode('Bailey'), ('PL', '')) + self.assertEqual(self.pa_4.encode('Baivel'), ('PFL', '')) + self.assertEqual(self.pa_4.encode('Baker'), ('PKR', '')) + self.assertEqual(self.pa_4.encode('Baldwin'), ('PLTN', '')) + self.assertEqual(self.pa_4.encode('Balsley'), ('PLSL', '')) + self.assertEqual(self.pa_4.encode('Barber'), ('PRPR', '')) + self.assertEqual(self.pa_4.encode('Barker'), ('PRKR', '')) + self.assertEqual(self.pa_4.encode('Barlow'), ('PRL', 'PRLF')) + self.assertEqual(self.pa_4.encode('Barnard'), ('PRNR', '')) + self.assertEqual(self.pa_4.encode('Barnes'), ('PRNS', '')) + self.assertEqual(self.pa_4.encode('Barnsley'), ('PRNS', '')) + self.assertEqual(self.pa_4.encode('Barouxis'), ('PRKS', '')) + self.assertEqual(self.pa_4.encode('Bartlet'), ('PRTL', '')) + self.assertEqual(self.pa_4.encode('Basley'), ('PSL', '')) + 
self.assertEqual(self.pa_4.encode('Basset'), ('PST', '')) + self.assertEqual(self.pa_4.encode('Bassett'), ('PST', '')) + self.assertEqual(self.pa_4.encode('Batchlor'), ('PXLR', '')) + self.assertEqual(self.pa_4.encode('Bates'), ('PTS', '')) + self.assertEqual(self.pa_4.encode('Batson'), ('PTSN', '')) + self.assertEqual(self.pa_4.encode('Bayes'), ('PS', '')) + self.assertEqual(self.pa_4.encode('Bayley'), ('PL', '')) + self.assertEqual(self.pa_4.encode('Beale'), ('PL', '')) + self.assertEqual(self.pa_4.encode('Beauchamp'), ('PXMP', 'PKMP')) + self.assertEqual(self.pa_4.encode('Beauclerc'), ('PKLR', '')) + self.assertEqual(self.pa_4.encode('Beech'), ('PK', '')) + self.assertEqual(self.pa_4.encode('Beers'), ('PRS', '')) + self.assertEqual(self.pa_4.encode('Beke'), ('PK', '')) + self.assertEqual(self.pa_4.encode('Belcher'), ('PLXR', 'PLKR')) + self.assertEqual(self.pa_4.encode('Benjamin'), ('PNJM', '')) + self.assertEqual(self.pa_4.encode('Benningham'), ('PNNK', '')) + self.assertEqual(self.pa_4.encode('Bereford'), ('PRFR', '')) + self.assertEqual(self.pa_4.encode('Bergen'), ('PRJN', 'PRKN')) + self.assertEqual(self.pa_4.encode('Berkeley'), ('PRKL', '')) + self.assertEqual(self.pa_4.encode('Berry'), ('PR', '')) + self.assertEqual(self.pa_4.encode('Besse'), ('PS', '')) + self.assertEqual(self.pa_4.encode('Bessey'), ('PS', '')) + self.assertEqual(self.pa_4.encode('Bessiles'), ('PSLS', '')) + self.assertEqual(self.pa_4.encode('Bigelow'), ('PJL', 'PKLF')) + self.assertEqual(self.pa_4.encode('Bigg'), ('PK', '')) + self.assertEqual(self.pa_4.encode('Bigod'), ('PKT', '')) + self.assertEqual(self.pa_4.encode('Billings'), ('PLNK', '')) + self.assertEqual(self.pa_4.encode('Bimper'), ('PMPR', '')) + self.assertEqual(self.pa_4.encode('Binker'), ('PNKR', '')) + self.assertEqual(self.pa_4.encode('Birdsill'), ('PRTS', '')) + self.assertEqual(self.pa_4.encode('Bishop'), ('PXP', '')) + self.assertEqual(self.pa_4.encode('Black'), ('PLK', '')) + self.assertEqual(self.pa_4.encode('Blagge'), ('PLK', '')) + self.assertEqual(self.pa_4.encode('Blake'), ('PLK', '')) + self.assertEqual(self.pa_4.encode('Blanck'), ('PLNK', '')) + self.assertEqual(self.pa_4.encode('Bledsoe'), ('PLTS', '')) + self.assertEqual(self.pa_4.encode('Blennerhasset'), ('PLNR', '')) + self.assertEqual(self.pa_4.encode('Blessing'), ('PLSN', '')) + self.assertEqual(self.pa_4.encode('Blewett'), ('PLT', '')) + self.assertEqual(self.pa_4.encode('Bloctgoed'), ('PLKT', '')) + self.assertEqual(self.pa_4.encode('Bloetgoet'), ('PLTK', '')) + self.assertEqual(self.pa_4.encode('Bloodgood'), ('PLTK', '')) + self.assertEqual(self.pa_4.encode('Blossom'), ('PLSM', '')) + self.assertEqual(self.pa_4.encode('Blount'), ('PLNT', '')) + self.assertEqual(self.pa_4.encode('Bodine'), ('PTN', '')) + self.assertEqual(self.pa_4.encode('Bodman'), ('PTMN', '')) + self.assertEqual(self.pa_4.encode('BonCoeur'), ('PNKR', '')) + self.assertEqual(self.pa_4.encode('Bond'), ('PNT', '')) + self.assertEqual(self.pa_4.encode('Boscawen'), ('PSKN', '')) + self.assertEqual(self.pa_4.encode('Bosworth'), ('PSR0', 'PSRT')) + self.assertEqual(self.pa_4.encode('Bouchier'), ('PX', 'PKR')) + self.assertEqual(self.pa_4.encode('Bowne'), ('PN', '')) + self.assertEqual(self.pa_4.encode('Bradbury'), ('PRTP', '')) + self.assertEqual(self.pa_4.encode('Bradder'), ('PRTR', '')) + self.assertEqual(self.pa_4.encode('Bradford'), ('PRTF', '')) + self.assertEqual(self.pa_4.encode('Bradstreet'), ('PRTS', '')) + self.assertEqual(self.pa_4.encode('Braham'), ('PRHM', '')) + 
self.assertEqual(self.pa_4.encode('Brailsford'), ('PRLS', '')) + self.assertEqual(self.pa_4.encode('Brainard'), ('PRNR', '')) + self.assertEqual(self.pa_4.encode('Brandish'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Braun'), ('PRN', '')) + self.assertEqual(self.pa_4.encode('Brecc'), ('PRK', '')) + self.assertEqual(self.pa_4.encode('Brent'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Brenton'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Briggs'), ('PRKS', '')) + self.assertEqual(self.pa_4.encode('Brigham'), ('PRM', '')) + self.assertEqual(self.pa_4.encode('Brobst'), ('PRPS', '')) + self.assertEqual(self.pa_4.encode('Brome'), ('PRM', '')) + self.assertEqual(self.pa_4.encode('Bronson'), ('PRNS', '')) + self.assertEqual(self.pa_4.encode('Brooks'), ('PRKS', '')) + self.assertEqual(self.pa_4.encode('Brouillard'), ('PRLR', '')) + self.assertEqual(self.pa_4.encode('Brown'), ('PRN', '')) + self.assertEqual(self.pa_4.encode('Browne'), ('PRN', '')) + self.assertEqual(self.pa_4.encode('Brownell'), ('PRNL', '')) + self.assertEqual(self.pa_4.encode('Bruley'), ('PRL', '')) + self.assertEqual(self.pa_4.encode('Bryant'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Brzozowski'), ('PRSS', 'PRTS')) + self.assertEqual(self.pa_4.encode('Buide'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Bulmer'), ('PLMR', '')) + self.assertEqual(self.pa_4.encode('Bunker'), ('PNKR', '')) + self.assertEqual(self.pa_4.encode('Burden'), ('PRTN', '')) + self.assertEqual(self.pa_4.encode('Burge'), ('PRJ', 'PRK')) + self.assertEqual(self.pa_4.encode('Burgoyne'), ('PRKN', '')) + self.assertEqual(self.pa_4.encode('Burke'), ('PRK', '')) + self.assertEqual(self.pa_4.encode('Burnett'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Burpee'), ('PRP', '')) + self.assertEqual(self.pa_4.encode('Bursley'), ('PRSL', '')) + self.assertEqual(self.pa_4.encode('Burton'), ('PRTN', '')) + self.assertEqual(self.pa_4.encode('Bushnell'), ('PXNL', '')) + self.assertEqual(self.pa_4.encode('Buss'), ('PS', '')) + self.assertEqual(self.pa_4.encode('Buswell'), ('PSL', '')) + self.assertEqual(self.pa_4.encode('Butler'), ('PTLR', '')) + self.assertEqual(self.pa_4.encode('Calkin'), ('KLKN', '')) + self.assertEqual(self.pa_4.encode('Canada'), ('KNT', '')) + self.assertEqual(self.pa_4.encode('Canmore'), ('KNMR', '')) + self.assertEqual(self.pa_4.encode('Canney'), ('KN', '')) + self.assertEqual(self.pa_4.encode('Capet'), ('KPT', '')) + self.assertEqual(self.pa_4.encode('Card'), ('KRT', '')) + self.assertEqual(self.pa_4.encode('Carman'), ('KRMN', '')) + self.assertEqual(self.pa_4.encode('Carpenter'), ('KRPN', '')) + self.assertEqual(self.pa_4.encode('Cartwright'), ('KRTR', '')) + self.assertEqual(self.pa_4.encode('Casey'), ('KS', '')) + self.assertEqual(self.pa_4.encode('Catterfield'), ('KTRF', '')) + self.assertEqual(self.pa_4.encode('Ceeley'), ('SL', '')) + self.assertEqual(self.pa_4.encode('Chambers'), ('XMPR', '')) + self.assertEqual(self.pa_4.encode('Champion'), ('XMPN', '')) + self.assertEqual(self.pa_4.encode('Chapman'), ('XPMN', '')) + self.assertEqual(self.pa_4.encode('Chase'), ('XS', '')) + self.assertEqual(self.pa_4.encode('Cheney'), ('XN', '')) + self.assertEqual(self.pa_4.encode('Chetwynd'), ('XTNT', '')) + self.assertEqual(self.pa_4.encode('Chevalier'), ('XFL', 'XFLR')) + self.assertEqual(self.pa_4.encode('Chillingsworth'), ('XLNK', '')) + self.assertEqual(self.pa_4.encode('Christie'), ('KRST', '')) + self.assertEqual(self.pa_4.encode('Chubbuck'), ('XPK', '')) + self.assertEqual(self.pa_4.encode('Church'), 
('XRX', 'XRK')) + self.assertEqual(self.pa_4.encode('Clark'), ('KLRK', '')) + self.assertEqual(self.pa_4.encode('Clarke'), ('KLRK', '')) + self.assertEqual(self.pa_4.encode('Cleare'), ('KLR', '')) + self.assertEqual(self.pa_4.encode('Clement'), ('KLMN', '')) + self.assertEqual(self.pa_4.encode('Clerke'), ('KLRK', '')) + self.assertEqual(self.pa_4.encode('Clibben'), ('KLPN', '')) + self.assertEqual(self.pa_4.encode('Clifford'), ('KLFR', '')) + self.assertEqual(self.pa_4.encode('Clivedon'), ('KLFT', '')) + self.assertEqual(self.pa_4.encode('Close'), ('KLS', '')) + self.assertEqual(self.pa_4.encode('Clothilde'), ('KL0L', 'KLTL')) + self.assertEqual(self.pa_4.encode('Cobb'), ('KP', '')) + self.assertEqual(self.pa_4.encode('Coburn'), ('KPRN', '')) + self.assertEqual(self.pa_4.encode('Coburne'), ('KPRN', '')) + self.assertEqual(self.pa_4.encode('Cocke'), ('KK', '')) + self.assertEqual(self.pa_4.encode('Coffin'), ('KFN', '')) + self.assertEqual(self.pa_4.encode('Coffyn'), ('KFN', '')) + self.assertEqual(self.pa_4.encode('Colborne'), ('KLPR', '')) + self.assertEqual(self.pa_4.encode('Colby'), ('KLP', '')) + self.assertEqual(self.pa_4.encode('Cole'), ('KL', '')) + self.assertEqual(self.pa_4.encode('Coleman'), ('KLMN', '')) + self.assertEqual(self.pa_4.encode('Collier'), ('KL', 'KLR')) + self.assertEqual(self.pa_4.encode('Compton'), ('KMPT', '')) + self.assertEqual(self.pa_4.encode('Cone'), ('KN', '')) + self.assertEqual(self.pa_4.encode('Cook'), ('KK', '')) + self.assertEqual(self.pa_4.encode('Cooke'), ('KK', '')) + self.assertEqual(self.pa_4.encode('Cooper'), ('KPR', '')) + self.assertEqual(self.pa_4.encode('Copperthwaite'), ('KPR0', 'KPRT')) + self.assertEqual(self.pa_4.encode('Corbet'), ('KRPT', '')) + self.assertEqual(self.pa_4.encode('Corell'), ('KRL', '')) + self.assertEqual(self.pa_4.encode('Corey'), ('KR', '')) + self.assertEqual(self.pa_4.encode('Corlies'), ('KRLS', '')) + self.assertEqual(self.pa_4.encode('Corneliszen'), ('KRNL', '')) + self.assertEqual(self.pa_4.encode('Cornelius'), ('KRNL', '')) + self.assertEqual(self.pa_4.encode('Cornwallis'), ('KRNL', '')) + self.assertEqual(self.pa_4.encode('Cosgrove'), ('KSKR', '')) + self.assertEqual(self.pa_4.encode('Count of Brionne'), ('KNTF', '')) + self.assertEqual(self.pa_4.encode('Covill'), ('KFL', '')) + self.assertEqual(self.pa_4.encode('Cowperthwaite'), ('KPR0', 'KPRT')) + self.assertEqual(self.pa_4.encode('Cowperwaite'), ('KPRT', '')) + self.assertEqual(self.pa_4.encode('Crane'), ('KRN', '')) + self.assertEqual(self.pa_4.encode('Creagmile'), ('KRKM', '')) + self.assertEqual(self.pa_4.encode('Crew'), ('KR', 'KRF')) + self.assertEqual(self.pa_4.encode('Crispin'), ('KRSP', '')) + self.assertEqual(self.pa_4.encode('Crocker'), ('KRKR', '')) + self.assertEqual(self.pa_4.encode('Crockett'), ('KRKT', '')) + self.assertEqual(self.pa_4.encode('Crosby'), ('KRSP', '')) + self.assertEqual(self.pa_4.encode('Crump'), ('KRMP', '')) + self.assertEqual(self.pa_4.encode('Cunningham'), ('KNNK', '')) + self.assertEqual(self.pa_4.encode('Curtis'), ('KRTS', '')) + self.assertEqual(self.pa_4.encode('Cutha'), ('K0', 'KT')) + self.assertEqual(self.pa_4.encode('Cutter'), ('KTR', '')) + self.assertEqual(self.pa_4.encode("D'Aubigny"), ('TPN', 'TPKN')) + self.assertEqual(self.pa_4.encode('DAVIS'), ('TFS', '')) + self.assertEqual(self.pa_4.encode('Dabinott'), ('TPNT', '')) + self.assertEqual(self.pa_4.encode('Dacre'), ('TKR', '')) + self.assertEqual(self.pa_4.encode('Daggett'), ('TKT', '')) + self.assertEqual(self.pa_4.encode('Danvers'), ('TNFR', '')) + 
self.assertEqual(self.pa_4.encode('Darcy'), ('TRS', '')) + self.assertEqual(self.pa_4.encode('Davis'), ('TFS', '')) + self.assertEqual(self.pa_4.encode('Dawn'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Dawson'), ('TSN', '')) + self.assertEqual(self.pa_4.encode('Day'), ('T', '')) + self.assertEqual(self.pa_4.encode('Daye'), ('T', '')) + self.assertEqual(self.pa_4.encode('DeGrenier'), ('TKRN', '')) + self.assertEqual(self.pa_4.encode('Dean'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Deekindaugh'), ('TKNT', '')) + self.assertEqual(self.pa_4.encode('Dennis'), ('TNS', '')) + self.assertEqual(self.pa_4.encode('Denny'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Denton'), ('TNTN', '')) + self.assertEqual(self.pa_4.encode('Desborough'), ('TSPR', '')) + self.assertEqual(self.pa_4.encode('Despenser'), ('TSPN', '')) + self.assertEqual(self.pa_4.encode('Deverill'), ('TFRL', '')) + self.assertEqual(self.pa_4.encode('Devine'), ('TFN', '')) + self.assertEqual(self.pa_4.encode('Dexter'), ('TKST', '')) + self.assertEqual(self.pa_4.encode('Dillaway'), ('TL', '')) + self.assertEqual(self.pa_4.encode('Dimmick'), ('TMK', '')) + self.assertEqual(self.pa_4.encode('Dinan'), ('TNN', '')) + self.assertEqual(self.pa_4.encode('Dix'), ('TKS', '')) + self.assertEqual(self.pa_4.encode('Doggett'), ('TKT', '')) + self.assertEqual(self.pa_4.encode('Donahue'), ('TNH', '')) + self.assertEqual(self.pa_4.encode('Dorfman'), ('TRFM', '')) + self.assertEqual(self.pa_4.encode('Dorris'), ('TRS', '')) + self.assertEqual(self.pa_4.encode('Dow'), ('T', 'TF')) + self.assertEqual(self.pa_4.encode('Downey'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Downing'), ('TNNK', '')) + self.assertEqual(self.pa_4.encode('Dowsett'), ('TST', '')) + self.assertEqual(self.pa_4.encode('Duck?'), ('TK', '')) + self.assertEqual(self.pa_4.encode('Dudley'), ('TTL', '')) + self.assertEqual(self.pa_4.encode('Duffy'), ('TF', '')) + self.assertEqual(self.pa_4.encode('Dunn'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Dunsterville'), ('TNST', '')) + self.assertEqual(self.pa_4.encode('Durrant'), ('TRNT', '')) + self.assertEqual(self.pa_4.encode('Durrin'), ('TRN', '')) + self.assertEqual(self.pa_4.encode('Dustin'), ('TSTN', '')) + self.assertEqual(self.pa_4.encode('Duston'), ('TSTN', '')) + self.assertEqual(self.pa_4.encode('Eames'), ('AMS', '')) + self.assertEqual(self.pa_4.encode('Early'), ('ARL', '')) + self.assertEqual(self.pa_4.encode('Easty'), ('AST', '')) + self.assertEqual(self.pa_4.encode('Ebbett'), ('APT', '')) + self.assertEqual(self.pa_4.encode('Eberbach'), ('APRP', '')) + self.assertEqual(self.pa_4.encode('Eberhard'), ('APRR', '')) + self.assertEqual(self.pa_4.encode('Eddy'), ('AT', '')) + self.assertEqual(self.pa_4.encode('Edenden'), ('ATNT', '')) + self.assertEqual(self.pa_4.encode('Edwards'), ('ATRT', '')) + self.assertEqual(self.pa_4.encode('Eglinton'), ('AKLN', 'ALNT')) + self.assertEqual(self.pa_4.encode('Eliot'), ('ALT', '')) + self.assertEqual(self.pa_4.encode('Elizabeth'), ('ALSP', '')) + self.assertEqual(self.pa_4.encode('Ellis'), ('ALS', '')) + self.assertEqual(self.pa_4.encode('Ellison'), ('ALSN', '')) + self.assertEqual(self.pa_4.encode('Ellot'), ('ALT', '')) + self.assertEqual(self.pa_4.encode('Elny'), ('ALN', '')) + self.assertEqual(self.pa_4.encode('Elsner'), ('ALSN', '')) + self.assertEqual(self.pa_4.encode('Emerson'), ('AMRS', '')) + self.assertEqual(self.pa_4.encode('Empson'), ('AMPS', '')) + self.assertEqual(self.pa_4.encode('Est'), ('AST', '')) + self.assertEqual(self.pa_4.encode('Estabrook'), 
('ASTP', '')) + self.assertEqual(self.pa_4.encode('Estes'), ('ASTS', '')) + self.assertEqual(self.pa_4.encode('Estey'), ('AST', '')) + self.assertEqual(self.pa_4.encode('Evans'), ('AFNS', '')) + self.assertEqual(self.pa_4.encode('Fallowell'), ('FLL', '')) + self.assertEqual(self.pa_4.encode('Farnsworth'), ('FRNS', '')) + self.assertEqual(self.pa_4.encode('Feake'), ('FK', '')) + self.assertEqual(self.pa_4.encode('Feke'), ('FK', '')) + self.assertEqual(self.pa_4.encode('Fellows'), ('FLS', '')) + self.assertEqual(self.pa_4.encode('Fettiplace'), ('FTPL', '')) + self.assertEqual(self.pa_4.encode('Finney'), ('FN', '')) + self.assertEqual(self.pa_4.encode('Fischer'), ('FXR', 'FSKR')) + self.assertEqual(self.pa_4.encode('Fisher'), ('FXR', '')) + self.assertEqual(self.pa_4.encode('Fisk'), ('FSK', '')) + self.assertEqual(self.pa_4.encode('Fiske'), ('FSK', '')) + self.assertEqual(self.pa_4.encode('Fletcher'), ('FLXR', '')) + self.assertEqual(self.pa_4.encode('Folger'), ('FLKR', 'FLJR')) + self.assertEqual(self.pa_4.encode('Foliot'), ('FLT', '')) + self.assertEqual(self.pa_4.encode('Folyot'), ('FLT', '')) + self.assertEqual(self.pa_4.encode('Fones'), ('FNS', '')) + self.assertEqual(self.pa_4.encode('Fordham'), ('FRTM', '')) + self.assertEqual(self.pa_4.encode('Forstner'), ('FRST', '')) + self.assertEqual(self.pa_4.encode('Fosten'), ('FSTN', '')) + self.assertEqual(self.pa_4.encode('Foster'), ('FSTR', '')) + self.assertEqual(self.pa_4.encode('Foulke'), ('FLK', '')) + self.assertEqual(self.pa_4.encode('Fowler'), ('FLR', '')) + self.assertEqual(self.pa_4.encode('Foxwell'), ('FKSL', '')) + self.assertEqual(self.pa_4.encode('Fraley'), ('FRL', '')) + self.assertEqual(self.pa_4.encode('Franceys'), ('FRNS', '')) + self.assertEqual(self.pa_4.encode('Franke'), ('FRNK', '')) + self.assertEqual(self.pa_4.encode('Frascella'), ('FRSL', '')) + self.assertEqual(self.pa_4.encode('Frazer'), ('FRSR', '')) + self.assertEqual(self.pa_4.encode('Fredd'), ('FRT', '')) + self.assertEqual(self.pa_4.encode('Freeman'), ('FRMN', '')) + self.assertEqual(self.pa_4.encode('French'), ('FRNX', 'FRNK')) + self.assertEqual(self.pa_4.encode('Freville'), ('FRFL', '')) + self.assertEqual(self.pa_4.encode('Frey'), ('FR', '')) + self.assertEqual(self.pa_4.encode('Frick'), ('FRK', '')) + self.assertEqual(self.pa_4.encode('Frier'), ('FR', 'FRR')) + self.assertEqual(self.pa_4.encode('Froe'), ('FR', '')) + self.assertEqual(self.pa_4.encode('Frorer'), ('FRRR', '')) + self.assertEqual(self.pa_4.encode('Frost'), ('FRST', '')) + self.assertEqual(self.pa_4.encode('Frothingham'), ('FR0N', 'FRTN')) + self.assertEqual(self.pa_4.encode('Fry'), ('FR', '')) + self.assertEqual(self.pa_4.encode('Gaffney'), ('KFN', '')) + self.assertEqual(self.pa_4.encode('Gage'), ('KJ', 'KK')) + self.assertEqual(self.pa_4.encode('Gallion'), ('KLN', '')) + self.assertEqual(self.pa_4.encode('Gallishan'), ('KLXN', '')) + self.assertEqual(self.pa_4.encode('Gamble'), ('KMPL', '')) + self.assertEqual(self.pa_4.encode('Garbrand'), ('KRPR', '')) + self.assertEqual(self.pa_4.encode('Gardner'), ('KRTN', '')) + self.assertEqual(self.pa_4.encode('Garrett'), ('KRT', '')) + self.assertEqual(self.pa_4.encode('Gassner'), ('KSNR', '')) + self.assertEqual(self.pa_4.encode('Gater'), ('KTR', '')) + self.assertEqual(self.pa_4.encode('Gaunt'), ('KNT', '')) + self.assertEqual(self.pa_4.encode('Gayer'), ('KR', '')) + self.assertEqual(self.pa_4.encode('Gerken'), ('KRKN', 'JRKN')) + self.assertEqual(self.pa_4.encode('Gerritsen'), ('KRTS', 'JRTS')) + self.assertEqual(self.pa_4.encode('Gibbs'), ('KPS', 
'JPS')) + self.assertEqual(self.pa_4.encode('Giffard'), ('JFRT', 'KFRT')) + self.assertEqual(self.pa_4.encode('Gilbert'), ('KLPR', 'JLPR')) + self.assertEqual(self.pa_4.encode('Gill'), ('KL', 'JL')) + self.assertEqual(self.pa_4.encode('Gilman'), ('KLMN', 'JLMN')) + self.assertEqual(self.pa_4.encode('Glass'), ('KLS', '')) + self.assertEqual(self.pa_4.encode('GoddardGifford'), ('KTRJ', '')) + self.assertEqual(self.pa_4.encode('Godfrey'), ('KTFR', '')) + self.assertEqual(self.pa_4.encode('Godwin'), ('KTN', '')) + self.assertEqual(self.pa_4.encode('Goodale'), ('KTL', '')) + self.assertEqual(self.pa_4.encode('Goodnow'), ('KTN', 'KTNF')) + self.assertEqual(self.pa_4.encode('Gorham'), ('KRM', '')) + self.assertEqual(self.pa_4.encode('Goseline'), ('KSLN', '')) + self.assertEqual(self.pa_4.encode('Gott'), ('KT', '')) + self.assertEqual(self.pa_4.encode('Gould'), ('KLT', '')) + self.assertEqual(self.pa_4.encode('Grafton'), ('KRFT', '')) + self.assertEqual(self.pa_4.encode('Grant'), ('KRNT', '')) + self.assertEqual(self.pa_4.encode('Gray'), ('KR', '')) + self.assertEqual(self.pa_4.encode('Green'), ('KRN', '')) + self.assertEqual(self.pa_4.encode('Griffin'), ('KRFN', '')) + self.assertEqual(self.pa_4.encode('Grill'), ('KRL', '')) + self.assertEqual(self.pa_4.encode('Grim'), ('KRM', '')) + self.assertEqual(self.pa_4.encode('Grisgonelle'), ('KRSK', '')) + self.assertEqual(self.pa_4.encode('Gross'), ('KRS', '')) + self.assertEqual(self.pa_4.encode('Guba'), ('KP', '')) + self.assertEqual(self.pa_4.encode('Gybbes'), ('KPS', 'JPS')) + self.assertEqual(self.pa_4.encode('Haburne'), ('HPRN', '')) + self.assertEqual(self.pa_4.encode('Hackburne'), ('HKPR', '')) + self.assertEqual(self.pa_4.encode('Haddon?'), ('HTN', '')) + self.assertEqual(self.pa_4.encode('Haines'), ('HNS', '')) + self.assertEqual(self.pa_4.encode('Hale'), ('HL', '')) + self.assertEqual(self.pa_4.encode('Hall'), ('HL', '')) + self.assertEqual(self.pa_4.encode('Hallet'), ('HLT', '')) + self.assertEqual(self.pa_4.encode('Hallock'), ('HLK', '')) + self.assertEqual(self.pa_4.encode('Halstead'), ('HLST', '')) + self.assertEqual(self.pa_4.encode('Hammond'), ('HMNT', '')) + self.assertEqual(self.pa_4.encode('Hance'), ('HNS', '')) + self.assertEqual(self.pa_4.encode('Handy'), ('HNT', '')) + self.assertEqual(self.pa_4.encode('Hanson'), ('HNSN', '')) + self.assertEqual(self.pa_4.encode('Harasek'), ('HRSK', '')) + self.assertEqual(self.pa_4.encode('Harcourt'), ('HRKR', '')) + self.assertEqual(self.pa_4.encode('Hardy'), ('HRT', '')) + self.assertEqual(self.pa_4.encode('Harlock'), ('HRLK', '')) + self.assertEqual(self.pa_4.encode('Harris'), ('HRS', '')) + self.assertEqual(self.pa_4.encode('Hartley'), ('HRTL', '')) + self.assertEqual(self.pa_4.encode('Harvey'), ('HRF', '')) + self.assertEqual(self.pa_4.encode('Harvie'), ('HRF', '')) + self.assertEqual(self.pa_4.encode('Harwood'), ('HRT', '')) + self.assertEqual(self.pa_4.encode('Hathaway'), ('H0', 'HT')) + self.assertEqual(self.pa_4.encode('Haukeness'), ('HKNS', '')) + self.assertEqual(self.pa_4.encode('Hawkes'), ('HKS', '')) + self.assertEqual(self.pa_4.encode('Hawkhurst'), ('HKRS', '')) + self.assertEqual(self.pa_4.encode('Hawkins'), ('HKNS', '')) + self.assertEqual(self.pa_4.encode('Hawley'), ('HL', '')) + self.assertEqual(self.pa_4.encode('Heald'), ('HLT', '')) + self.assertEqual(self.pa_4.encode('Helsdon'), ('HLST', '')) + self.assertEqual(self.pa_4.encode('Hemenway'), ('HMN', '')) + self.assertEqual(self.pa_4.encode('Hemmenway'), ('HMN', '')) + self.assertEqual(self.pa_4.encode('Henck'), ('HNK', '')) + 
self.assertEqual(self.pa_4.encode('Henderson'), ('HNTR', '')) + self.assertEqual(self.pa_4.encode('Hendricks'), ('HNTR', '')) + self.assertEqual(self.pa_4.encode('Hersey'), ('HRS', '')) + self.assertEqual(self.pa_4.encode('Hewes'), ('HS', '')) + self.assertEqual(self.pa_4.encode('Heyman'), ('HMN', '')) + self.assertEqual(self.pa_4.encode('Hicks'), ('HKS', '')) + self.assertEqual(self.pa_4.encode('Hidden'), ('HTN', '')) + self.assertEqual(self.pa_4.encode('Higgs'), ('HKS', '')) + self.assertEqual(self.pa_4.encode('Hill'), ('HL', '')) + self.assertEqual(self.pa_4.encode('Hills'), ('HLS', '')) + self.assertEqual(self.pa_4.encode('Hinckley'), ('HNKL', '')) + self.assertEqual(self.pa_4.encode('Hipwell'), ('HPL', '')) + self.assertEqual(self.pa_4.encode('Hobart'), ('HPRT', '')) + self.assertEqual(self.pa_4.encode('Hoben'), ('HPN', '')) + self.assertEqual(self.pa_4.encode('Hoffmann'), ('HFMN', '')) + self.assertEqual(self.pa_4.encode('Hogan'), ('HKN', '')) + self.assertEqual(self.pa_4.encode('Holmes'), ('HLMS', '')) + self.assertEqual(self.pa_4.encode('Hoo'), ('H', '')) + self.assertEqual(self.pa_4.encode('Hooker'), ('HKR', '')) + self.assertEqual(self.pa_4.encode('Hopcott'), ('HPKT', '')) + self.assertEqual(self.pa_4.encode('Hopkins'), ('HPKN', '')) + self.assertEqual(self.pa_4.encode('Hopkinson'), ('HPKN', '')) + self.assertEqual(self.pa_4.encode('Hornsey'), ('HRNS', '')) + self.assertEqual(self.pa_4.encode('Houckgeest'), ('HKJS', 'HKKS')) + self.assertEqual(self.pa_4.encode('Hough'), ('H', '')) + self.assertEqual(self.pa_4.encode('Houstin'), ('HSTN', '')) + self.assertEqual(self.pa_4.encode('How'), ('H', 'HF')) + self.assertEqual(self.pa_4.encode('Howe'), ('H', '')) + self.assertEqual(self.pa_4.encode('Howland'), ('HLNT', '')) + self.assertEqual(self.pa_4.encode('Hubner'), ('HPNR', '')) + self.assertEqual(self.pa_4.encode('Hudnut'), ('HTNT', '')) + self.assertEqual(self.pa_4.encode('Hughes'), ('HS', '')) + self.assertEqual(self.pa_4.encode('Hull'), ('HL', '')) + self.assertEqual(self.pa_4.encode('Hulme'), ('HLM', '')) + self.assertEqual(self.pa_4.encode('Hume'), ('HM', '')) + self.assertEqual(self.pa_4.encode('Hundertumark'), ('HNTR', '')) + self.assertEqual(self.pa_4.encode('Hundley'), ('HNTL', '')) + self.assertEqual(self.pa_4.encode('Hungerford'), ('HNKR', 'HNJR')) + self.assertEqual(self.pa_4.encode('Hunt'), ('HNT', '')) + self.assertEqual(self.pa_4.encode('Hurst'), ('HRST', '')) + self.assertEqual(self.pa_4.encode('Husbands'), ('HSPN', '')) + self.assertEqual(self.pa_4.encode('Hussey'), ('HS', '')) + self.assertEqual(self.pa_4.encode('Husted'), ('HSTT', '')) + self.assertEqual(self.pa_4.encode('Hutchins'), ('HXNS', '')) + self.assertEqual(self.pa_4.encode('Hutchinson'), ('HXNS', '')) + self.assertEqual(self.pa_4.encode('Huttinger'), ('HTNK', 'HTNJ')) + self.assertEqual(self.pa_4.encode('Huybertsen'), ('HPRT', '')) + self.assertEqual(self.pa_4.encode('Iddenden'), ('ATNT', '')) + self.assertEqual(self.pa_4.encode('Ingraham'), ('ANKR', '')) + self.assertEqual(self.pa_4.encode('Ives'), ('AFS', '')) + self.assertEqual(self.pa_4.encode('Jackson'), ('JKSN', 'AKSN')) + self.assertEqual(self.pa_4.encode('Jacob'), ('JKP', 'AKP')) + self.assertEqual(self.pa_4.encode('Jans'), ('JNS', 'ANS')) + self.assertEqual(self.pa_4.encode('Jenkins'), ('JNKN', 'ANKN')) + self.assertEqual(self.pa_4.encode('Jewett'), ('JT', 'AT')) + self.assertEqual(self.pa_4.encode('Jewitt'), ('JT', 'AT')) + self.assertEqual(self.pa_4.encode('Johnson'), ('JNSN', 'ANSN')) + self.assertEqual(self.pa_4.encode('Jones'), ('JNS', 
'ANS')) + self.assertEqual(self.pa_4.encode('Josephine'), ('JSFN', 'HSFN')) + self.assertEqual(self.pa_4.encode('Judd'), ('JT', 'AT')) + self.assertEqual(self.pa_4.encode('June'), ('JN', 'AN')) + self.assertEqual(self.pa_4.encode('Kamarowska'), ('KMRS', '')) + self.assertEqual(self.pa_4.encode('Kay'), ('K', '')) + self.assertEqual(self.pa_4.encode('Kelley'), ('KL', '')) + self.assertEqual(self.pa_4.encode('Kelly'), ('KL', '')) + self.assertEqual(self.pa_4.encode('Keymber'), ('KMPR', '')) + self.assertEqual(self.pa_4.encode('Keynes'), ('KNS', '')) + self.assertEqual(self.pa_4.encode('Kilham'), ('KLM', '')) + self.assertEqual(self.pa_4.encode('Kim'), ('KM', '')) + self.assertEqual(self.pa_4.encode('Kimball'), ('KMPL', '')) + self.assertEqual(self.pa_4.encode('King'), ('KNK', '')) + self.assertEqual(self.pa_4.encode('Kinsey'), ('KNS', '')) + self.assertEqual(self.pa_4.encode('Kirk'), ('KRK', '')) + self.assertEqual(self.pa_4.encode('Kirton'), ('KRTN', '')) + self.assertEqual(self.pa_4.encode('Kistler'), ('KSTL', '')) + self.assertEqual(self.pa_4.encode('Kitchen'), ('KXN', '')) + self.assertEqual(self.pa_4.encode('Kitson'), ('KTSN', '')) + self.assertEqual(self.pa_4.encode('Klett'), ('KLT', '')) + self.assertEqual(self.pa_4.encode('Kline'), ('KLN', '')) + self.assertEqual(self.pa_4.encode('Knapp'), ('NP', '')) + self.assertEqual(self.pa_4.encode('Knight'), ('NT', '')) + self.assertEqual(self.pa_4.encode('Knote'), ('NT', '')) + self.assertEqual(self.pa_4.encode('Knott'), ('NT', '')) + self.assertEqual(self.pa_4.encode('Knox'), ('NKS', '')) + self.assertEqual(self.pa_4.encode('Koeller'), ('KLR', '')) + self.assertEqual(self.pa_4.encode('La Pointe'), ('LPNT', '')) + self.assertEqual(self.pa_4.encode('LaPlante'), ('LPLN', '')) + self.assertEqual(self.pa_4.encode('Laimbeer'), ('LMPR', '')) + self.assertEqual(self.pa_4.encode('Lamb'), ('LMP', '')) + self.assertEqual(self.pa_4.encode('Lambertson'), ('LMPR', '')) + self.assertEqual(self.pa_4.encode('Lancto'), ('LNKT', '')) + self.assertEqual(self.pa_4.encode('Landry'), ('LNTR', '')) + self.assertEqual(self.pa_4.encode('Lane'), ('LN', '')) + self.assertEqual(self.pa_4.encode('Langendyck'), ('LNJN', 'LNKN')) + self.assertEqual(self.pa_4.encode('Langer'), ('LNKR', 'LNJR')) + self.assertEqual(self.pa_4.encode('Langford'), ('LNKF', '')) + self.assertEqual(self.pa_4.encode('Lantersee'), ('LNTR', '')) + self.assertEqual(self.pa_4.encode('Laquer'), ('LKR', '')) + self.assertEqual(self.pa_4.encode('Larkin'), ('LRKN', '')) + self.assertEqual(self.pa_4.encode('Latham'), ('LTM', '')) + self.assertEqual(self.pa_4.encode('Lathrop'), ('L0RP', 'LTRP')) + self.assertEqual(self.pa_4.encode('Lauter'), ('LTR', '')) + self.assertEqual(self.pa_4.encode('Lawrence'), ('LRNS', '')) + self.assertEqual(self.pa_4.encode('Leach'), ('LK', '')) + self.assertEqual(self.pa_4.encode('Leager'), ('LKR', 'LJR')) + self.assertEqual(self.pa_4.encode('Learned'), ('LRNT', '')) + self.assertEqual(self.pa_4.encode('Leavitt'), ('LFT', '')) + self.assertEqual(self.pa_4.encode('Lee'), ('L', '')) + self.assertEqual(self.pa_4.encode('Leete'), ('LT', '')) + self.assertEqual(self.pa_4.encode('Leggett'), ('LKT', '')) + self.assertEqual(self.pa_4.encode('Leland'), ('LLNT', '')) + self.assertEqual(self.pa_4.encode('Leonard'), ('LNRT', '')) + self.assertEqual(self.pa_4.encode('Lester'), ('LSTR', '')) + self.assertEqual(self.pa_4.encode('Lestrange'), ('LSTR', '')) + self.assertEqual(self.pa_4.encode('Lethem'), ('L0M', 'LTM')) + self.assertEqual(self.pa_4.encode('Levine'), ('LFN', '')) + 
self.assertEqual(self.pa_4.encode('Lewes'), ('LS', '')) + self.assertEqual(self.pa_4.encode('Lewis'), ('LS', '')) + self.assertEqual(self.pa_4.encode('Lincoln'), ('LNKL', '')) + self.assertEqual(self.pa_4.encode('Lindsey'), ('LNTS', '')) + self.assertEqual(self.pa_4.encode('Linher'), ('LNR', '')) + self.assertEqual(self.pa_4.encode('Lippet'), ('LPT', '')) + self.assertEqual(self.pa_4.encode('Lippincott'), ('LPNK', '')) + self.assertEqual(self.pa_4.encode('Lockwood'), ('LKT', '')) + self.assertEqual(self.pa_4.encode('Loines'), ('LNS', '')) + self.assertEqual(self.pa_4.encode('Lombard'), ('LMPR', '')) + self.assertEqual(self.pa_4.encode('Long'), ('LNK', '')) + self.assertEqual(self.pa_4.encode('Longespee'), ('LNJS', 'LNKS')) + self.assertEqual(self.pa_4.encode('Look'), ('LK', '')) + self.assertEqual(self.pa_4.encode('Lounsberry'), ('LNSP', '')) + self.assertEqual(self.pa_4.encode('Lounsbury'), ('LNSP', '')) + self.assertEqual(self.pa_4.encode('Louthe'), ('L0', 'LT')) + self.assertEqual(self.pa_4.encode('Loveyne'), ('LFN', '')) + self.assertEqual(self.pa_4.encode('Lowe'), ('L', '')) + self.assertEqual(self.pa_4.encode('Ludlam'), ('LTLM', '')) + self.assertEqual(self.pa_4.encode('Lumbard'), ('LMPR', '')) + self.assertEqual(self.pa_4.encode('Lund'), ('LNT', '')) + self.assertEqual(self.pa_4.encode('Luno'), ('LN', '')) + self.assertEqual(self.pa_4.encode('Lutz'), ('LTS', '')) + self.assertEqual(self.pa_4.encode('Lydia'), ('LT', '')) + self.assertEqual(self.pa_4.encode('Lynne'), ('LN', '')) + self.assertEqual(self.pa_4.encode('Lyon'), ('LN', '')) + self.assertEqual(self.pa_4.encode('MacAlpin'), ('MKLP', '')) + self.assertEqual(self.pa_4.encode('MacBricc'), ('MKPR', '')) + self.assertEqual(self.pa_4.encode('MacCrinan'), ('MKRN', '')) + self.assertEqual(self.pa_4.encode('MacKenneth'), ('MKN0', 'MKNT')) + self.assertEqual(self.pa_4.encode('MacMael nam Bo'), ('MKML', '')) + self.assertEqual(self.pa_4.encode('MacMurchada'), ('MKMR', '')) + self.assertEqual(self.pa_4.encode('Macomber'), ('MKMP', '')) + self.assertEqual(self.pa_4.encode('Macy'), ('MS', '')) + self.assertEqual(self.pa_4.encode('Magnus'), ('MNS', 'MKNS')) + self.assertEqual(self.pa_4.encode('Mahien'), ('MHN', '')) + self.assertEqual(self.pa_4.encode('Malmains'), ('MLMN', '')) + self.assertEqual(self.pa_4.encode('Malory'), ('MLR', '')) + self.assertEqual(self.pa_4.encode('Mancinelli'), ('MNSN', '')) + self.assertEqual(self.pa_4.encode('Mancini'), ('MNSN', '')) + self.assertEqual(self.pa_4.encode('Mann'), ('MN', '')) + self.assertEqual(self.pa_4.encode('Manning'), ('MNNK', '')) + self.assertEqual(self.pa_4.encode('Manter'), ('MNTR', '')) + self.assertEqual(self.pa_4.encode('Marion'), ('MRN', '')) + self.assertEqual(self.pa_4.encode('Marley'), ('MRL', '')) + self.assertEqual(self.pa_4.encode('Marmion'), ('MRMN', '')) + self.assertEqual(self.pa_4.encode('Marquart'), ('MRKR', '')) + self.assertEqual(self.pa_4.encode('Marsh'), ('MRX', '')) + self.assertEqual(self.pa_4.encode('Marshal'), ('MRXL', '')) + self.assertEqual(self.pa_4.encode('Marshall'), ('MRXL', '')) + self.assertEqual(self.pa_4.encode('Martel'), ('MRTL', '')) + self.assertEqual(self.pa_4.encode('Martha'), ('MR0', 'MRT')) + self.assertEqual(self.pa_4.encode('Martin'), ('MRTN', '')) + self.assertEqual(self.pa_4.encode('Marturano'), ('MRTR', '')) + self.assertEqual(self.pa_4.encode('Marvin'), ('MRFN', '')) + self.assertEqual(self.pa_4.encode('Mary'), ('MR', '')) + self.assertEqual(self.pa_4.encode('Mason'), ('MSN', '')) + self.assertEqual(self.pa_4.encode('Maxwell'), ('MKSL', '')) + 
self.assertEqual(self.pa_4.encode('Mayhew'), ('MH', 'MHF')) + self.assertEqual(self.pa_4.encode('McAllaster'), ('MKLS', '')) + self.assertEqual(self.pa_4.encode('McAllister'), ('MKLS', '')) + self.assertEqual(self.pa_4.encode('McConnell'), ('MKNL', '')) + self.assertEqual(self.pa_4.encode('McFarland'), ('MKFR', '')) + self.assertEqual(self.pa_4.encode('McIlroy'), ('MSLR', '')) + self.assertEqual(self.pa_4.encode('McNair'), ('MKNR', '')) + self.assertEqual(self.pa_4.encode('McNair-Landry'), ('MKNR', '')) + self.assertEqual(self.pa_4.encode('McRaven'), ('MKRF', '')) + self.assertEqual(self.pa_4.encode('Mead'), ('MT', '')) + self.assertEqual(self.pa_4.encode('Meade'), ('MT', '')) + self.assertEqual(self.pa_4.encode('Meck'), ('MK', '')) + self.assertEqual(self.pa_4.encode('Melton'), ('MLTN', '')) + self.assertEqual(self.pa_4.encode('Mendenhall'), ('MNTN', '')) + self.assertEqual(self.pa_4.encode('Mering'), ('MRNK', '')) + self.assertEqual(self.pa_4.encode('Merrick'), ('MRK', '')) + self.assertEqual(self.pa_4.encode('Merry'), ('MR', '')) + self.assertEqual(self.pa_4.encode('Mighill'), ('ML', '')) + self.assertEqual(self.pa_4.encode('Miller'), ('MLR', '')) + self.assertEqual(self.pa_4.encode('Milton'), ('MLTN', '')) + self.assertEqual(self.pa_4.encode('Mohun'), ('MHN', '')) + self.assertEqual(self.pa_4.encode('Montague'), ('MNTK', '')) + self.assertEqual(self.pa_4.encode('Montboucher'), ('MNTP', '')) + self.assertEqual(self.pa_4.encode('Moore'), ('MR', '')) + self.assertEqual(self.pa_4.encode('Morrel'), ('MRL', '')) + self.assertEqual(self.pa_4.encode('Morrill'), ('MRL', '')) + self.assertEqual(self.pa_4.encode('Morris'), ('MRS', '')) + self.assertEqual(self.pa_4.encode('Morton'), ('MRTN', '')) + self.assertEqual(self.pa_4.encode('Moton'), ('MTN', '')) + self.assertEqual(self.pa_4.encode('Muir'), ('MR', '')) + self.assertEqual(self.pa_4.encode('Mulferd'), ('MLFR', '')) + self.assertEqual(self.pa_4.encode('Mullins'), ('MLNS', '')) + self.assertEqual(self.pa_4.encode('Mulso'), ('MLS', '')) + self.assertEqual(self.pa_4.encode('Munger'), ('MNKR', 'MNJR')) + self.assertEqual(self.pa_4.encode('Munt'), ('MNT', '')) + self.assertEqual(self.pa_4.encode('Murchad'), ('MRXT', 'MRKT')) + self.assertEqual(self.pa_4.encode('Murdock'), ('MRTK', '')) + self.assertEqual(self.pa_4.encode('Murray'), ('MR', '')) + self.assertEqual(self.pa_4.encode('Muskett'), ('MSKT', '')) + self.assertEqual(self.pa_4.encode('Myers'), ('MRS', '')) + self.assertEqual(self.pa_4.encode('Myrick'), ('MRK', '')) + self.assertEqual(self.pa_4.encode('NORRIS'), ('NRS', '')) + self.assertEqual(self.pa_4.encode('Nayle'), ('NL', '')) + self.assertEqual(self.pa_4.encode('Newcomb'), ('NKMP', '')) + self.assertEqual(self.pa_4.encode('Newcomb(e)'), ('NKMP', '')) + self.assertEqual(self.pa_4.encode('Newkirk'), ('NKRK', '')) + self.assertEqual(self.pa_4.encode('Newton'), ('NTN', '')) + self.assertEqual(self.pa_4.encode('Niles'), ('NLS', '')) + self.assertEqual(self.pa_4.encode('Noble'), ('NPL', '')) + self.assertEqual(self.pa_4.encode('Noel'), ('NL', '')) + self.assertEqual(self.pa_4.encode('Northend'), ('NR0N', 'NRTN')) + self.assertEqual(self.pa_4.encode('Norton'), ('NRTN', '')) + self.assertEqual(self.pa_4.encode('Nutter'), ('NTR', '')) + self.assertEqual(self.pa_4.encode('Odding'), ('ATNK', '')) + self.assertEqual(self.pa_4.encode('Odenbaugh'), ('ATNP', '')) + self.assertEqual(self.pa_4.encode('Ogborn'), ('AKPR', '')) + self.assertEqual(self.pa_4.encode('Oppenheimer'), ('APNM', '')) + self.assertEqual(self.pa_4.encode('Otis'), ('ATS', '')) + 
self.assertEqual(self.pa_4.encode('Oviatt'), ('AFT', '')) + self.assertEqual(self.pa_4.encode('PRUST?'), ('PRST', '')) + self.assertEqual(self.pa_4.encode('Paddock'), ('PTK', '')) + self.assertEqual(self.pa_4.encode('Page'), ('PJ', 'PK')) + self.assertEqual(self.pa_4.encode('Paine'), ('PN', '')) + self.assertEqual(self.pa_4.encode('Paist'), ('PST', '')) + self.assertEqual(self.pa_4.encode('Palmer'), ('PLMR', '')) + self.assertEqual(self.pa_4.encode('Park'), ('PRK', '')) + self.assertEqual(self.pa_4.encode('Parker'), ('PRKR', '')) + self.assertEqual(self.pa_4.encode('Parkhurst'), ('PRKR', '')) + self.assertEqual(self.pa_4.encode('Parrat'), ('PRT', '')) + self.assertEqual(self.pa_4.encode('Parsons'), ('PRSN', '')) + self.assertEqual(self.pa_4.encode('Partridge'), ('PRTR', '')) + self.assertEqual(self.pa_4.encode('Pashley'), ('PXL', '')) + self.assertEqual(self.pa_4.encode('Pasley'), ('PSL', '')) + self.assertEqual(self.pa_4.encode('Patrick'), ('PTRK', '')) + self.assertEqual(self.pa_4.encode('Pattee'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Patten'), ('PTN', '')) + self.assertEqual(self.pa_4.encode('Pawley'), ('PL', '')) + self.assertEqual(self.pa_4.encode('Payne'), ('PN', '')) + self.assertEqual(self.pa_4.encode('Peabody'), ('PPT', '')) + self.assertEqual(self.pa_4.encode('Peake'), ('PK', '')) + self.assertEqual(self.pa_4.encode('Pearson'), ('PRSN', '')) + self.assertEqual(self.pa_4.encode('Peat'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Pedersen'), ('PTRS', '')) + self.assertEqual(self.pa_4.encode('Percy'), ('PRS', '')) + self.assertEqual(self.pa_4.encode('Perkins'), ('PRKN', '')) + self.assertEqual(self.pa_4.encode('Perrine'), ('PRN', '')) + self.assertEqual(self.pa_4.encode('Perry'), ('PR', '')) + self.assertEqual(self.pa_4.encode('Peson'), ('PSN', '')) + self.assertEqual(self.pa_4.encode('Peterson'), ('PTRS', '')) + self.assertEqual(self.pa_4.encode('Peyton'), ('PTN', '')) + self.assertEqual(self.pa_4.encode('Phinney'), ('FN', '')) + self.assertEqual(self.pa_4.encode('Pickard'), ('PKRT', '')) + self.assertEqual(self.pa_4.encode('Pierce'), ('PRS', '')) + self.assertEqual(self.pa_4.encode('Pierrepont'), ('PRPN', '')) + self.assertEqual(self.pa_4.encode('Pike'), ('PK', '')) + self.assertEqual(self.pa_4.encode('Pinkham'), ('PNKM', '')) + self.assertEqual(self.pa_4.encode('Pitman'), ('PTMN', '')) + self.assertEqual(self.pa_4.encode('Pitt'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Pitts'), ('PTS', '')) + self.assertEqual(self.pa_4.encode('Plantagenet'), ('PLNT', '')) + self.assertEqual(self.pa_4.encode('Platt'), ('PLT', '')) + self.assertEqual(self.pa_4.encode('Platts'), ('PLTS', '')) + self.assertEqual(self.pa_4.encode('Pleis'), ('PLS', '')) + self.assertEqual(self.pa_4.encode('Pleiss'), ('PLS', '')) + self.assertEqual(self.pa_4.encode('Plisko'), ('PLSK', '')) + self.assertEqual(self.pa_4.encode('Pliskovitch'), ('PLSK', '')) + self.assertEqual(self.pa_4.encode('Plum'), ('PLM', '')) + self.assertEqual(self.pa_4.encode('Plume'), ('PLM', '')) + self.assertEqual(self.pa_4.encode('Poitou'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Pomeroy'), ('PMR', '')) + self.assertEqual(self.pa_4.encode('Poretiers'), ('PRTR', '')) + self.assertEqual(self.pa_4.encode('Pote'), ('PT', '')) + self.assertEqual(self.pa_4.encode('Potter'), ('PTR', '')) + self.assertEqual(self.pa_4.encode('Potts'), ('PTS', '')) + self.assertEqual(self.pa_4.encode('Powell'), ('PL', '')) + self.assertEqual(self.pa_4.encode('Pratt'), ('PRT', '')) + self.assertEqual(self.pa_4.encode('Presbury'), 
('PRSP', '')) + self.assertEqual(self.pa_4.encode('Priest'), ('PRST', '')) + self.assertEqual(self.pa_4.encode('Prindle'), ('PRNT', '')) + self.assertEqual(self.pa_4.encode('Prior'), ('PRR', '')) + self.assertEqual(self.pa_4.encode('Profumo'), ('PRFM', '')) + self.assertEqual(self.pa_4.encode('Purdy'), ('PRT', '')) + self.assertEqual(self.pa_4.encode('Purefoy'), ('PRF', '')) + self.assertEqual(self.pa_4.encode('Pury'), ('PR', '')) + self.assertEqual(self.pa_4.encode('Quinter'), ('KNTR', '')) + self.assertEqual(self.pa_4.encode('Rachel'), ('RXL', 'RKL')) + self.assertEqual(self.pa_4.encode('Rand'), ('RNT', '')) + self.assertEqual(self.pa_4.encode('Rankin'), ('RNKN', '')) + self.assertEqual(self.pa_4.encode('Ravenscroft'), ('RFNS', '')) + self.assertEqual(self.pa_4.encode('Raynsford'), ('RNSF', '')) + self.assertEqual(self.pa_4.encode('Reakirt'), ('RKRT', '')) + self.assertEqual(self.pa_4.encode('Reaves'), ('RFS', '')) + self.assertEqual(self.pa_4.encode('Reeves'), ('RFS', '')) + self.assertEqual(self.pa_4.encode('Reichert'), ('RXRT', 'RKRT')) + self.assertEqual(self.pa_4.encode('Remmele'), ('RML', '')) + self.assertEqual(self.pa_4.encode('Reynolds'), ('RNLT', '')) + self.assertEqual(self.pa_4.encode('Rhodes'), ('RTS', '')) + self.assertEqual(self.pa_4.encode('Richards'), ('RXRT', 'RKRT')) + self.assertEqual(self.pa_4.encode('Richardson'), ('RXRT', 'RKRT')) + self.assertEqual(self.pa_4.encode('Ring'), ('RNK', '')) + self.assertEqual(self.pa_4.encode('Roberts'), ('RPRT', '')) + self.assertEqual(self.pa_4.encode('Robertson'), ('RPRT', '')) + self.assertEqual(self.pa_4.encode('Robson'), ('RPSN', '')) + self.assertEqual(self.pa_4.encode('Rodie'), ('RT', '')) + self.assertEqual(self.pa_4.encode('Rody'), ('RT', '')) + self.assertEqual(self.pa_4.encode('Rogers'), ('RKRS', 'RJRS')) + self.assertEqual(self.pa_4.encode('Ross'), ('RS', '')) + self.assertEqual(self.pa_4.encode('Rosslevin'), ('RSLF', '')) + self.assertEqual(self.pa_4.encode('Rowland'), ('RLNT', '')) + self.assertEqual(self.pa_4.encode('Ruehl'), ('RL', '')) + self.assertEqual(self.pa_4.encode('Russell'), ('RSL', '')) + self.assertEqual(self.pa_4.encode('Ruth'), ('R0', 'RT')) + self.assertEqual(self.pa_4.encode('Ryan'), ('RN', '')) + self.assertEqual(self.pa_4.encode('Rysse'), ('RS', '')) + self.assertEqual(self.pa_4.encode('Sadler'), ('STLR', '')) + self.assertEqual(self.pa_4.encode('Salmon'), ('SLMN', '')) + self.assertEqual(self.pa_4.encode('Salter'), ('SLTR', '')) + self.assertEqual(self.pa_4.encode('Salvatore'), ('SLFT', '')) + self.assertEqual(self.pa_4.encode('Sanders'), ('SNTR', '')) + self.assertEqual(self.pa_4.encode('Sands'), ('SNTS', '')) + self.assertEqual(self.pa_4.encode('Sanford'), ('SNFR', '')) + self.assertEqual(self.pa_4.encode('Sanger'), ('SNKR', 'SNJR')) + self.assertEqual(self.pa_4.encode('Sargent'), ('SRJN', 'SRKN')) + self.assertEqual(self.pa_4.encode('Saunders'), ('SNTR', '')) + self.assertEqual(self.pa_4.encode('Schilling'), ('XLNK', '')) + self.assertEqual(self.pa_4.encode('Schlegel'), ('XLKL', 'SLKL')) + self.assertEqual(self.pa_4.encode('Scott'), ('SKT', '')) + self.assertEqual(self.pa_4.encode('Sears'), ('SRS', '')) + self.assertEqual(self.pa_4.encode('Segersall'), ('SJRS', 'SKRS')) + self.assertEqual(self.pa_4.encode('Senecal'), ('SNKL', '')) + self.assertEqual(self.pa_4.encode('Sergeaux'), ('SRJ', 'SRK')) + self.assertEqual(self.pa_4.encode('Severance'), ('SFRN', '')) + self.assertEqual(self.pa_4.encode('Sharp'), ('XRP', '')) + self.assertEqual(self.pa_4.encode('Sharpe'), ('XRP', '')) + 
self.assertEqual(self.pa_4.encode('Sharply'), ('XRPL', '')) + self.assertEqual(self.pa_4.encode('Shatswell'), ('XTSL', '')) + self.assertEqual(self.pa_4.encode('Shattack'), ('XTK', '')) + self.assertEqual(self.pa_4.encode('Shattock'), ('XTK', '')) + self.assertEqual(self.pa_4.encode('Shattuck'), ('XTK', '')) + self.assertEqual(self.pa_4.encode('Shaw'), ('X', 'XF')) + self.assertEqual(self.pa_4.encode('Sheldon'), ('XLTN', '')) + self.assertEqual(self.pa_4.encode('Sherman'), ('XRMN', '')) + self.assertEqual(self.pa_4.encode('Shinn'), ('XN', '')) + self.assertEqual(self.pa_4.encode('Shirford'), ('XRFR', '')) + self.assertEqual(self.pa_4.encode('Shirley'), ('XRL', '')) + self.assertEqual(self.pa_4.encode('Shively'), ('XFL', '')) + self.assertEqual(self.pa_4.encode('Shoemaker'), ('XMKR', '')) + self.assertEqual(self.pa_4.encode('Short'), ('XRT', '')) + self.assertEqual(self.pa_4.encode('Shotwell'), ('XTL', '')) + self.assertEqual(self.pa_4.encode('Shute'), ('XT', '')) + self.assertEqual(self.pa_4.encode('Sibley'), ('SPL', '')) + self.assertEqual(self.pa_4.encode('Silver'), ('SLFR', '')) + self.assertEqual(self.pa_4.encode('Simes'), ('SMS', '')) + self.assertEqual(self.pa_4.encode('Sinken'), ('SNKN', '')) + self.assertEqual(self.pa_4.encode('Sinn'), ('SN', '')) + self.assertEqual(self.pa_4.encode('Skelton'), ('SKLT', '')) + self.assertEqual(self.pa_4.encode('Skiffe'), ('SKF', '')) + self.assertEqual(self.pa_4.encode('Skotkonung'), ('SKTK', '')) + self.assertEqual(self.pa_4.encode('Slade'), ('SLT', 'XLT')) + self.assertEqual(self.pa_4.encode('Slye'), ('SL', 'XL')) + self.assertEqual(self.pa_4.encode('Smedley'), ('SMTL', 'XMTL')) + self.assertEqual(self.pa_4.encode('Smith'), ('SM0', 'XMT')) + self.assertEqual(self.pa_4.encode('Snow'), ('SN', 'XNF')) + self.assertEqual(self.pa_4.encode('Soole'), ('SL', '')) + self.assertEqual(self.pa_4.encode('Soule'), ('SL', '')) + self.assertEqual(self.pa_4.encode('Southworth'), ('S0R0', 'STRT')) + self.assertEqual(self.pa_4.encode('Sowles'), ('SLS', '')) + self.assertEqual(self.pa_4.encode('Spalding'), ('SPLT', '')) + self.assertEqual(self.pa_4.encode('Spark'), ('SPRK', '')) + self.assertEqual(self.pa_4.encode('Spencer'), ('SPNS', '')) + self.assertEqual(self.pa_4.encode('Sperry'), ('SPR', '')) + self.assertEqual(self.pa_4.encode('Spofford'), ('SPFR', '')) + self.assertEqual(self.pa_4.encode('Spooner'), ('SPNR', '')) + self.assertEqual(self.pa_4.encode('Sprague'), ('SPRK', '')) + self.assertEqual(self.pa_4.encode('Springer'), ('SPRN', '')) + self.assertEqual(self.pa_4.encode('St. Clair'), ('STKL', '')) + self.assertEqual(self.pa_4.encode('St. Claire'), ('STKL', '')) + self.assertEqual(self.pa_4.encode('St. Leger'), ('STLJ', 'STLK')) + self.assertEqual(self.pa_4.encode('St. 
Omer'), ('STMR', '')) + self.assertEqual(self.pa_4.encode('Stafferton'), ('STFR', '')) + self.assertEqual(self.pa_4.encode('Stafford'), ('STFR', '')) + self.assertEqual(self.pa_4.encode('Stalham'), ('STLM', '')) + self.assertEqual(self.pa_4.encode('Stanford'), ('STNF', '')) + self.assertEqual(self.pa_4.encode('Stanton'), ('STNT', '')) + self.assertEqual(self.pa_4.encode('Star'), ('STR', '')) + self.assertEqual(self.pa_4.encode('Starbuck'), ('STRP', '')) + self.assertEqual(self.pa_4.encode('Starkey'), ('STRK', '')) + self.assertEqual(self.pa_4.encode('Starkweather'), ('STRK', '')) + self.assertEqual(self.pa_4.encode('Stearns'), ('STRN', '')) + self.assertEqual(self.pa_4.encode('Stebbins'), ('STPN', '')) + self.assertEqual(self.pa_4.encode('Steele'), ('STL', '')) + self.assertEqual(self.pa_4.encode('Stephenson'), ('STFN', '')) + self.assertEqual(self.pa_4.encode('Stevens'), ('STFN', '')) + self.assertEqual(self.pa_4.encode('Stoddard'), ('STTR', '')) + self.assertEqual(self.pa_4.encode('Stodder'), ('STTR', '')) + self.assertEqual(self.pa_4.encode('Stone'), ('STN', '')) + self.assertEqual(self.pa_4.encode('Storey'), ('STR', '')) + self.assertEqual(self.pa_4.encode('Storrada'), ('STRT', '')) + self.assertEqual(self.pa_4.encode('Story'), ('STR', '')) + self.assertEqual(self.pa_4.encode('Stoughton'), ('STFT', '')) + self.assertEqual(self.pa_4.encode('Stout'), ('STT', '')) + self.assertEqual(self.pa_4.encode('Stow'), ('ST', 'STF')) + self.assertEqual(self.pa_4.encode('Strong'), ('STRN', '')) + self.assertEqual(self.pa_4.encode('Strutt'), ('STRT', '')) + self.assertEqual(self.pa_4.encode('Stryker'), ('STRK', '')) + self.assertEqual(self.pa_4.encode('Stuckeley'), ('STKL', '')) + self.assertEqual(self.pa_4.encode('Sturges'), ('STRJ', 'STRK')) + self.assertEqual(self.pa_4.encode('Sturgess'), ('STRJ', 'STRK')) + self.assertEqual(self.pa_4.encode('Sturgis'), ('STRJ', 'STRK')) + self.assertEqual(self.pa_4.encode('Suevain'), ('SFN', '')) + self.assertEqual(self.pa_4.encode('Sulyard'), ('SLRT', '')) + self.assertEqual(self.pa_4.encode('Sutton'), ('STN', '')) + self.assertEqual(self.pa_4.encode('Swain'), ('SN', 'XN')) + self.assertEqual(self.pa_4.encode('Swayne'), ('SN', 'XN')) + self.assertEqual(self.pa_4.encode('Swayze'), ('SS', 'XTS')) + self.assertEqual(self.pa_4.encode('Swift'), ('SFT', 'XFT')) + self.assertEqual(self.pa_4.encode('Taber'), ('TPR', '')) + self.assertEqual(self.pa_4.encode('Talcott'), ('TLKT', '')) + self.assertEqual(self.pa_4.encode('Tarne'), ('TRN', '')) + self.assertEqual(self.pa_4.encode('Tatum'), ('TTM', '')) + self.assertEqual(self.pa_4.encode('Taverner'), ('TFRN', '')) + self.assertEqual(self.pa_4.encode('Taylor'), ('TLR', '')) + self.assertEqual(self.pa_4.encode('Tenney'), ('TN', '')) + self.assertEqual(self.pa_4.encode('Thayer'), ('0R', 'TR')) + self.assertEqual(self.pa_4.encode('Thember'), ('0MPR', 'TMPR')) + self.assertEqual(self.pa_4.encode('Thomas'), ('TMS', '')) + self.assertEqual(self.pa_4.encode('Thompson'), ('TMPS', '')) + self.assertEqual(self.pa_4.encode('Thorne'), ('0RN', 'TRN')) + self.assertEqual(self.pa_4.encode('Thornycraft'), ('0RNK', 'TRNK')) + self.assertEqual(self.pa_4.encode('Threlkeld'), ('0RLK', 'TRLK')) + self.assertEqual(self.pa_4.encode('Throckmorton'), ('0RKM', 'TRKM')) + self.assertEqual(self.pa_4.encode('Thwaits'), ('0TS', 'TTS')) + self.assertEqual(self.pa_4.encode('Tibbetts'), ('TPTS', '')) + self.assertEqual(self.pa_4.encode('Tidd'), ('TT', '')) + self.assertEqual(self.pa_4.encode('Tierney'), ('TRN', '')) + 
self.assertEqual(self.pa_4.encode('Tilley'), ('TL', ''))
+        self.assertEqual(self.pa_4.encode('Tillieres'), ('TLRS', ''))
+        self.assertEqual(self.pa_4.encode('Tilly'), ('TL', ''))
+        self.assertEqual(self.pa_4.encode('Tisdale'), ('TSTL', ''))
+        self.assertEqual(self.pa_4.encode('Titus'), ('TTS', ''))
+        self.assertEqual(self.pa_4.encode('Tobey'), ('TP', ''))
+        self.assertEqual(self.pa_4.encode('Tooker'), ('TKR', ''))
+        self.assertEqual(self.pa_4.encode('Towle'), ('TL', ''))
+        self.assertEqual(self.pa_4.encode('Towne'), ('TN', ''))
+        self.assertEqual(self.pa_4.encode('Townsend'), ('TNSN', ''))
+        self.assertEqual(self.pa_4.encode('Treadway'), ('TRT', ''))
+        self.assertEqual(self.pa_4.encode('Trelawney'), ('TRLN', ''))
+        self.assertEqual(self.pa_4.encode('Trinder'), ('TRNT', ''))
+        self.assertEqual(self.pa_4.encode('Tripp'), ('TRP', ''))
+        self.assertEqual(self.pa_4.encode('Trippe'), ('TRP', ''))
+        self.assertEqual(self.pa_4.encode('Trott'), ('TRT', ''))
+        self.assertEqual(self.pa_4.encode('True'), ('TR', ''))
+        self.assertEqual(self.pa_4.encode('Trussebut'), ('TRSP', ''))
+        self.assertEqual(self.pa_4.encode('Tucker'), ('TKR', ''))
+        self.assertEqual(self.pa_4.encode('Turgeon'), ('TRJN', 'TRKN'))
+        self.assertEqual(self.pa_4.encode('Turner'), ('TRNR', ''))
+        self.assertEqual(self.pa_4.encode('Tuttle'), ('TTL', ''))
+        self.assertEqual(self.pa_4.encode('Tyler'), ('TLR', ''))
+        self.assertEqual(self.pa_4.encode('Tylle'), ('TL', ''))
+        self.assertEqual(self.pa_4.encode('Tyrrel'), ('TRL', ''))
+        self.assertEqual(self.pa_4.encode('Ua Tuathail'), ('AT0L', 'ATTL'))
+        self.assertEqual(self.pa_4.encode('Ulrich'), ('ALRX', 'ALRK'))
+        self.assertEqual(self.pa_4.encode('Underhill'), ('ANTR', ''))
+        self.assertEqual(self.pa_4.encode('Underwood'), ('ANTR', ''))
+        self.assertEqual(self.pa_4.encode('Unknown'), ('ANKN', ''))
+        self.assertEqual(self.pa_4.encode('Valentine'), ('FLNT', ''))
+        self.assertEqual(self.pa_4.encode('Van Egmond'), ('FNKM', ''))
+        self.assertEqual(self.pa_4.encode('Van der Beek'), ('FNTR', ''))
+        self.assertEqual(self.pa_4.encode('Vaughan'), ('FKN', ''))
+        self.assertEqual(self.pa_4.encode('Vermenlen'), ('FRMN', ''))
+        self.assertEqual(self.pa_4.encode('Vincent'), ('FNSN', ''))
+        self.assertEqual(self.pa_4.encode('Volentine'), ('FLNT', ''))
+        self.assertEqual(self.pa_4.encode('Wagner'), ('AKNR', 'FKNR'))
+        self.assertEqual(self.pa_4.encode('Waite'), ('AT', 'FT'))
+        self.assertEqual(self.pa_4.encode('Walker'), ('ALKR', 'FLKR'))
+        self.assertEqual(self.pa_4.encode('Walter'), ('ALTR', 'FLTR'))
+        self.assertEqual(self.pa_4.encode('Wandell'), ('ANTL', 'FNTL'))
+        self.assertEqual(self.pa_4.encode('Wandesford'), ('ANTS', 'FNTS'))
+        self.assertEqual(self.pa_4.encode('Warbleton'), ('ARPL', 'FRPL'))
+        self.assertEqual(self.pa_4.encode('Ward'), ('ART', 'FRT'))
+        self.assertEqual(self.pa_4.encode('Warde'), ('ART', 'FRT'))
+        self.assertEqual(self.pa_4.encode('Ware'), ('AR', 'FR'))
+        self.assertEqual(self.pa_4.encode('Wareham'), ('ARHM', 'FRHM'))
+        self.assertEqual(self.pa_4.encode('Warner'), ('ARNR', 'FRNR'))
+        self.assertEqual(self.pa_4.encode('Warren'), ('ARN', 'FRN'))
+        self.assertEqual(self.pa_4.encode('Washburne'), ('AXPR', 'FXPR'))
+        self.assertEqual(self.pa_4.encode('Waterbury'), ('ATRP', 'FTRP'))
+        self.assertEqual(self.pa_4.encode('Watson'), ('ATSN', 'FTSN'))
         self.assertEqual(
-            self.pa.encode('of the Tributes', 4), ('AF0T', 'AFTT')
+            self.pa_4.encode('WatsonEllithorpe'), ('ATSN', 'FTSN')
         )
-        self.assertEqual(self.pa.encode('unknown', 4), ('ANKN', ''))
-        self.assertEqual(self.pa.encode('van der 
Gouda', 4), ('FNTR', '')) - self.assertEqual(self.pa.encode('von Adenbaugh', 4), ('FNTN', '')) - self.assertEqual(self.pa.encode('ARCHITure', 4), ('ARKT', '')) - self.assertEqual(self.pa.encode('Arnoff', 4), ('ARNF', '')) - self.assertEqual(self.pa.encode('Arnow', 4), ('ARN', 'ARNF')) - self.assertEqual(self.pa.encode('DANGER', 4), ('TNJR', 'TNKR')) - self.assertEqual(self.pa.encode('Jankelowicz', 4), ('JNKL', 'ANKL')) - self.assertEqual(self.pa.encode('MANGER', 4), ('MNJR', 'MNKR')) - self.assertEqual(self.pa.encode('McClellan', 4), ('MKLL', '')) - self.assertEqual(self.pa.encode('McHugh', 4), ('MK', '')) - self.assertEqual(self.pa.encode('McLaughlin', 4), ('MKLF', '')) - self.assertEqual(self.pa.encode('ORCHEStra', 4), ('ARKS', '')) - self.assertEqual(self.pa.encode('ORCHID', 4), ('ARKT', '')) - self.assertEqual(self.pa.encode('Pierce', 4), ('PRS', '')) - self.assertEqual(self.pa.encode('RANGER', 4), ('RNJR', 'RNKR')) - self.assertEqual(self.pa.encode('Schlesinger', 4), ('XLSN', 'SLSN')) - self.assertEqual(self.pa.encode('Uomo', 4), ('AM', '')) - self.assertEqual(self.pa.encode('Vasserman', 4), ('FSRM', '')) - self.assertEqual(self.pa.encode('Wasserman', 4), ('ASRM', 'FSRM')) - self.assertEqual(self.pa.encode('Womo', 4), ('AM', 'FM')) - self.assertEqual(self.pa.encode('Yankelovich', 4), ('ANKL', '')) - self.assertEqual(self.pa.encode('accede', 4), ('AKST', '')) - self.assertEqual(self.pa.encode('accident', 4), ('AKST', '')) - self.assertEqual(self.pa.encode('adelsheim', 4), ('ATLS', '')) - self.assertEqual(self.pa.encode('aged', 4), ('AJT', 'AKT')) - self.assertEqual(self.pa.encode('ageless', 4), ('AJLS', 'AKLS')) - self.assertEqual(self.pa.encode('agency', 4), ('AJNS', 'AKNS')) - self.assertEqual(self.pa.encode('aghast', 4), ('AKST', '')) - self.assertEqual(self.pa.encode('agio', 4), ('AJ', 'AK')) - self.assertEqual(self.pa.encode('agrimony', 4), ('AKRM', '')) - self.assertEqual(self.pa.encode('album', 4), ('ALPM', '')) - self.assertEqual(self.pa.encode('alcmene', 4), ('ALKM', '')) - self.assertEqual(self.pa.encode('alehouse', 4), ('ALHS', '')) - self.assertEqual(self.pa.encode('antique', 4), ('ANTK', '')) - self.assertEqual(self.pa.encode('artois', 4), ('ART', 'ARTS')) - self.assertEqual(self.pa.encode('automation', 4), ('ATMX', '')) - self.assertEqual(self.pa.encode('bacchus', 4), ('PKS', '')) - self.assertEqual(self.pa.encode('bacci', 4), ('PX', '')) - self.assertEqual(self.pa.encode('bajador', 4), ('PJTR', 'PHTR')) - self.assertEqual(self.pa.encode('bellocchio', 4), ('PLX', '')) - self.assertEqual(self.pa.encode('bertucci', 4), ('PRTX', '')) - self.assertEqual(self.pa.encode('biaggi', 4), ('PJ', 'PK')) - self.assertEqual(self.pa.encode('bough', 4), ('P', '')) - self.assertEqual(self.pa.encode('breaux', 4), ('PR', '')) - self.assertEqual(self.pa.encode('broughton', 4), ('PRTN', '')) - self.assertEqual(self.pa.encode('cabrillo', 4), ('KPRL', 'KPR')) - self.assertEqual(self.pa.encode('caesar', 4), ('SSR', '')) - self.assertEqual(self.pa.encode('cagney', 4), ('KKN', '')) - self.assertEqual(self.pa.encode('campbell', 4), ('KMPL', '')) - self.assertEqual(self.pa.encode('carlisle', 4), ('KRLL', '')) - self.assertEqual(self.pa.encode('carlysle', 4), ('KRLL', '')) - self.assertEqual(self.pa.encode('chemistry', 4), ('KMST', '')) - self.assertEqual(self.pa.encode('chianti', 4), ('KNT', '')) - self.assertEqual(self.pa.encode('chorus', 4), ('KRS', '')) - self.assertEqual(self.pa.encode('cough', 4), ('KF', '')) - self.assertEqual(self.pa.encode('czerny', 4), ('SRN', 'XRN')) - 
self.assertEqual(self.pa.encode('deffenbacher', 4), ('TFNP', '')) - self.assertEqual(self.pa.encode('dumb', 4), ('TM', '')) - self.assertEqual(self.pa.encode('edgar', 4), ('ATKR', '')) - self.assertEqual(self.pa.encode('edge', 4), ('AJ', '')) - self.assertEqual(self.pa.encode('filipowicz', 4), ('FLPT', 'FLPF')) - self.assertEqual(self.pa.encode('focaccia', 4), ('FKX', '')) - self.assertEqual(self.pa.encode('gallegos', 4), ('KLKS', 'KKS')) - self.assertEqual(self.pa.encode('gambrelli', 4), ('KMPR', '')) - self.assertEqual(self.pa.encode('geithain', 4), ('K0N', 'JTN')) - self.assertEqual(self.pa.encode('ghiradelli', 4), ('JRTL', '')) - self.assertEqual(self.pa.encode('ghislane', 4), ('JLN', '')) - self.assertEqual(self.pa.encode('gough', 4), ('KF', '')) - self.assertEqual(self.pa.encode('hartheim', 4), ('HR0M', 'HRTM')) - self.assertEqual(self.pa.encode('heimsheim', 4), ('HMSM', '')) - self.assertEqual(self.pa.encode('hochmeier', 4), ('HKMR', '')) - self.assertEqual(self.pa.encode('hugh', 4), ('H', '')) - self.assertEqual(self.pa.encode('hunger', 4), ('HNKR', 'HNJR')) - self.assertEqual(self.pa.encode('hungry', 4), ('HNKR', '')) - self.assertEqual(self.pa.encode('island', 4), ('ALNT', '')) - self.assertEqual(self.pa.encode('isle', 4), ('AL', '')) - self.assertEqual(self.pa.encode('jose', 4), ('HS', '')) - self.assertEqual(self.pa.encode('laugh', 4), ('LF', '')) - self.assertEqual(self.pa.encode('mac caffrey', 4), ('MKFR', '')) - self.assertEqual(self.pa.encode('mac gregor', 4), ('MKRK', '')) - self.assertEqual(self.pa.encode('pegnitz', 4), ('PNTS', 'PKNT')) - self.assertEqual(self.pa.encode('piskowitz', 4), ('PSKT', 'PSKF')) - self.assertEqual(self.pa.encode('queen', 4), ('KN', '')) - self.assertEqual(self.pa.encode('raspberry', 4), ('RSPR', '')) - self.assertEqual(self.pa.encode('resnais', 4), ('RSN', 'RSNS')) - self.assertEqual(self.pa.encode('rogier', 4), ('RJ', 'RJR')) - self.assertEqual(self.pa.encode('rough', 4), ('RF', '')) - self.assertEqual(self.pa.encode('san jacinto', 4), ('SNHS', '')) - self.assertEqual(self.pa.encode('schenker', 4), ('XNKR', 'SKNK')) - self.assertEqual(self.pa.encode('schermerhorn', 4), ('XRMR', 'SKRM')) - self.assertEqual(self.pa.encode('schmidt', 4), ('XMT', 'SMT')) - self.assertEqual(self.pa.encode('schneider', 4), ('XNTR', 'SNTR')) - self.assertEqual(self.pa.encode('school', 4), ('SKL', '')) - self.assertEqual(self.pa.encode('schooner', 4), ('SKNR', '')) - self.assertEqual(self.pa.encode('schrozberg', 4), ('XRSP', 'SRSP')) - self.assertEqual(self.pa.encode('schulman', 4), ('XLMN', '')) - self.assertEqual(self.pa.encode('schwabach', 4), ('XPK', 'XFPK')) - self.assertEqual(self.pa.encode('schwarzach', 4), ('XRSK', 'XFRT')) - self.assertEqual(self.pa.encode('smith', 4), ('SM0', 'XMT')) - self.assertEqual(self.pa.encode('snider', 4), ('SNTR', 'XNTR')) - self.assertEqual(self.pa.encode('succeed', 4), ('SKST', '')) - self.assertEqual(self.pa.encode('sugarcane', 4), ('XKRK', 'SKRK')) - self.assertEqual(self.pa.encode('svobodka', 4), ('SFPT', '')) - self.assertEqual(self.pa.encode('tagliaro', 4), ('TKLR', 'TLR')) - self.assertEqual(self.pa.encode('thames', 4), ('TMS', '')) - self.assertEqual(self.pa.encode('theilheim', 4), ('0LM', 'TLM')) - self.assertEqual(self.pa.encode('thomas', 4), ('TMS', '')) - self.assertEqual(self.pa.encode('thumb', 4), ('0M', 'TM')) - self.assertEqual(self.pa.encode('tichner', 4), ('TXNR', 'TKNR')) - self.assertEqual(self.pa.encode('tough', 4), ('TF', '')) - self.assertEqual(self.pa.encode('umbrella', 4), ('AMPR', '')) - 
self.assertEqual(self.pa.encode('vilshofen', 4), ('FLXF', '')) - self.assertEqual(self.pa.encode('von schuller', 4), ('FNXL', '')) - self.assertEqual(self.pa.encode('wachtler', 4), ('AKTL', 'FKTL')) - self.assertEqual(self.pa.encode('wechsler', 4), ('AKSL', 'FKSL')) - self.assertEqual(self.pa.encode('weikersheim', 4), ('AKRS', 'FKRS')) - self.assertEqual(self.pa.encode('zhao', 4), ('J', '')) + self.assertEqual(self.pa_4.encode('Watts'), ('ATS', 'FTS')) + self.assertEqual(self.pa_4.encode('Wayne'), ('AN', 'FN')) + self.assertEqual(self.pa_4.encode('Webb'), ('AP', 'FP')) + self.assertEqual(self.pa_4.encode('Weber'), ('APR', 'FPR')) + self.assertEqual(self.pa_4.encode('Webster'), ('APST', 'FPST')) + self.assertEqual(self.pa_4.encode('Weed'), ('AT', 'FT')) + self.assertEqual(self.pa_4.encode('Weeks'), ('AKS', 'FKS')) + self.assertEqual(self.pa_4.encode('Wells'), ('ALS', 'FLS')) + self.assertEqual(self.pa_4.encode('Wenzell'), ('ANSL', 'FNTS')) + self.assertEqual(self.pa_4.encode('West'), ('AST', 'FST')) + self.assertEqual(self.pa_4.encode('Westbury'), ('ASTP', 'FSTP')) + self.assertEqual(self.pa_4.encode('Whatlocke'), ('ATLK', '')) + self.assertEqual(self.pa_4.encode('Wheeler'), ('ALR', '')) + self.assertEqual(self.pa_4.encode('Whiston'), ('ASTN', '')) + self.assertEqual(self.pa_4.encode('White'), ('AT', '')) + self.assertEqual(self.pa_4.encode('Whitman'), ('ATMN', '')) + self.assertEqual(self.pa_4.encode('Whiton'), ('ATN', '')) + self.assertEqual(self.pa_4.encode('Whitson'), ('ATSN', '')) + self.assertEqual(self.pa_4.encode('Wickes'), ('AKS', 'FKS')) + self.assertEqual(self.pa_4.encode('Wilbur'), ('ALPR', 'FLPR')) + self.assertEqual(self.pa_4.encode('Wilcotes'), ('ALKT', 'FLKT')) + self.assertEqual(self.pa_4.encode('Wilkinson'), ('ALKN', 'FLKN')) + self.assertEqual(self.pa_4.encode('Willets'), ('ALTS', 'FLTS')) + self.assertEqual(self.pa_4.encode('Willett'), ('ALT', 'FLT')) + self.assertEqual(self.pa_4.encode('Willey'), ('AL', 'FL')) + self.assertEqual(self.pa_4.encode('Williams'), ('ALMS', 'FLMS')) + self.assertEqual(self.pa_4.encode('Williston'), ('ALST', 'FLST')) + self.assertEqual(self.pa_4.encode('Wilson'), ('ALSN', 'FLSN')) + self.assertEqual(self.pa_4.encode('Wimes'), ('AMS', 'FMS')) + self.assertEqual(self.pa_4.encode('Winch'), ('ANX', 'FNK')) + self.assertEqual(self.pa_4.encode('Winegar'), ('ANKR', 'FNKR')) + self.assertEqual(self.pa_4.encode('Wing'), ('ANK', 'FNK')) + self.assertEqual(self.pa_4.encode('Winsley'), ('ANSL', 'FNSL')) + self.assertEqual(self.pa_4.encode('Winslow'), ('ANSL', 'FNSL')) + self.assertEqual(self.pa_4.encode('Winthrop'), ('AN0R', 'FNTR')) + self.assertEqual(self.pa_4.encode('Wise'), ('AS', 'FS')) + self.assertEqual(self.pa_4.encode('Wood'), ('AT', 'FT')) + self.assertEqual(self.pa_4.encode('Woodbridge'), ('ATPR', 'FTPR')) + self.assertEqual(self.pa_4.encode('Woodward'), ('ATRT', 'FTRT')) + self.assertEqual(self.pa_4.encode('Wooley'), ('AL', 'FL')) + self.assertEqual(self.pa_4.encode('Woolley'), ('AL', 'FL')) + self.assertEqual(self.pa_4.encode('Worth'), ('AR0', 'FRT')) + self.assertEqual(self.pa_4.encode('Worthen'), ('AR0N', 'FRTN')) + self.assertEqual(self.pa_4.encode('Worthley'), ('AR0L', 'FRTL')) + self.assertEqual(self.pa_4.encode('Wright'), ('RT', '')) + self.assertEqual(self.pa_4.encode('Wyer'), ('AR', 'FR')) + self.assertEqual(self.pa_4.encode('Wyere'), ('AR', 'FR')) + self.assertEqual(self.pa_4.encode('Wynkoop'), ('ANKP', 'FNKP')) + self.assertEqual(self.pa_4.encode('Yarnall'), ('ARNL', '')) + self.assertEqual(self.pa_4.encode('Yeoman'), ('AMN', '')) + 
self.assertEqual(self.pa_4.encode('Yorke'), ('ARK', '')) + self.assertEqual(self.pa_4.encode('Young'), ('ANK', '')) + self.assertEqual(self.pa_4.encode('ab Wennonwen'), ('APNN', '')) + self.assertEqual(self.pa_4.encode('ap Llewellyn'), ('APLL', '')) + self.assertEqual(self.pa_4.encode('ap Lorwerth'), ('APLR', '')) + self.assertEqual(self.pa_4.encode("d'Angouleme"), ('TNKL', '')) + self.assertEqual(self.pa_4.encode('de Audeham'), ('TTHM', '')) + self.assertEqual(self.pa_4.encode('de Bavant'), ('TPFN', '')) + self.assertEqual(self.pa_4.encode('de Beauchamp'), ('TPXM', 'TPKM')) + self.assertEqual(self.pa_4.encode('de Beaumont'), ('TPMN', '')) + self.assertEqual(self.pa_4.encode('de Bolbec'), ('TPLP', '')) + self.assertEqual(self.pa_4.encode('de Braiose'), ('TPRS', '')) + self.assertEqual(self.pa_4.encode('de Braose'), ('TPRS', '')) + self.assertEqual(self.pa_4.encode('de Briwere'), ('TPRR', '')) + self.assertEqual(self.pa_4.encode('de Cantelou'), ('TKNT', '')) + self.assertEqual(self.pa_4.encode('de Cherelton'), ('TXRL', 'TKRL')) + self.assertEqual(self.pa_4.encode('de Cherleton'), ('TXRL', 'TKRL')) + self.assertEqual(self.pa_4.encode('de Clare'), ('TKLR', '')) + self.assertEqual(self.pa_4.encode('de Claremont'), ('TKLR', '')) + self.assertEqual(self.pa_4.encode('de Clifford'), ('TKLF', '')) + self.assertEqual(self.pa_4.encode('de Colville'), ('TKLF', '')) + self.assertEqual(self.pa_4.encode('de Courtenay'), ('TKRT', '')) + self.assertEqual(self.pa_4.encode('de Fauconberg'), ('TFKN', '')) + self.assertEqual(self.pa_4.encode('de Forest'), ('TFRS', '')) + self.assertEqual(self.pa_4.encode('de Gai'), ('TK', '')) + self.assertEqual(self.pa_4.encode('de Grey'), ('TKR', '')) + self.assertEqual(self.pa_4.encode('de Guernons'), ('TKRN', '')) + self.assertEqual(self.pa_4.encode('de Haia'), ('T', '')) + self.assertEqual(self.pa_4.encode('de Harcourt'), ('TRKR', '')) + self.assertEqual(self.pa_4.encode('de Hastings'), ('TSTN', '')) + self.assertEqual(self.pa_4.encode('de Hoke'), ('TK', '')) + self.assertEqual(self.pa_4.encode('de Hooch'), ('TK', '')) + self.assertEqual(self.pa_4.encode('de Hugelville'), ('TJLF', 'TKLF')) + self.assertEqual(self.pa_4.encode('de Huntingdon'), ('TNTN', '')) + self.assertEqual(self.pa_4.encode('de Insula'), ('TNSL', '')) + self.assertEqual(self.pa_4.encode('de Keynes'), ('TKNS', '')) + self.assertEqual(self.pa_4.encode('de Lacy'), ('TLS', '')) + self.assertEqual(self.pa_4.encode('de Lexington'), ('TLKS', '')) + self.assertEqual(self.pa_4.encode('de Lusignan'), ('TLSN', 'TLSK')) + self.assertEqual(self.pa_4.encode('de Manvers'), ('TMNF', '')) + self.assertEqual(self.pa_4.encode('de Montagu'), ('TMNT', '')) + self.assertEqual(self.pa_4.encode('de Montault'), ('TMNT', '')) + self.assertEqual(self.pa_4.encode('de Montfort'), ('TMNT', '')) + self.assertEqual(self.pa_4.encode('de Mortimer'), ('TMRT', '')) + self.assertEqual(self.pa_4.encode('de Morville'), ('TMRF', '')) + self.assertEqual(self.pa_4.encode('de Morvois'), ('TMRF', '')) + self.assertEqual(self.pa_4.encode('de Neufmarche'), ('TNFM', '')) + self.assertEqual(self.pa_4.encode('de Odingsells'), ('TTNK', '')) + self.assertEqual(self.pa_4.encode('de Odyngsells'), ('TTNK', '')) + self.assertEqual(self.pa_4.encode('de Percy'), ('TPRS', '')) + self.assertEqual(self.pa_4.encode('de Pierrepont'), ('TPRP', '')) + self.assertEqual(self.pa_4.encode('de Plessetis'), ('TPLS', '')) + self.assertEqual(self.pa_4.encode('de Porhoet'), ('TPRT', '')) + self.assertEqual(self.pa_4.encode('de Prouz'), ('TPRS', '')) + 
self.assertEqual(self.pa_4.encode('de Quincy'), ('TKNS', '')) + self.assertEqual(self.pa_4.encode('de Ripellis'), ('TRPL', '')) + self.assertEqual(self.pa_4.encode('de Ros'), ('TRS', '')) + self.assertEqual(self.pa_4.encode('de Salisbury'), ('TSLS', '')) + self.assertEqual(self.pa_4.encode('de Sanford'), ('TSNF', '')) + self.assertEqual(self.pa_4.encode('de Somery'), ('TSMR', '')) + self.assertEqual(self.pa_4.encode('de St. Hilary'), ('TSTL', '')) + self.assertEqual(self.pa_4.encode('de St. Liz'), ('TSTL', '')) + self.assertEqual(self.pa_4.encode('de Sutton'), ('TSTN', '')) + self.assertEqual(self.pa_4.encode('de Toeni'), ('TTN', '')) + self.assertEqual(self.pa_4.encode('de Tony'), ('TTN', '')) + self.assertEqual(self.pa_4.encode('de Umfreville'), ('TMFR', '')) + self.assertEqual(self.pa_4.encode('de Valognes'), ('TFLN', 'TFLK')) + self.assertEqual(self.pa_4.encode('de Vaux'), ('TF', '')) + self.assertEqual(self.pa_4.encode('de Vere'), ('TFR', '')) + self.assertEqual(self.pa_4.encode('de Vermandois'), ('TFRM', '')) + self.assertEqual(self.pa_4.encode('de Vernon'), ('TFRN', '')) + self.assertEqual(self.pa_4.encode('de Vexin'), ('TFKS', '')) + self.assertEqual(self.pa_4.encode('de Vitre'), ('TFTR', '')) + self.assertEqual(self.pa_4.encode('de Wandesford'), ('TNTS', '')) + self.assertEqual(self.pa_4.encode('de Warenne'), ('TRN', '')) + self.assertEqual(self.pa_4.encode('de Westbury'), ('TSTP', '')) + self.assertEqual(self.pa_4.encode('di Saluzzo'), ('TSLS', 'TSLT')) + self.assertEqual(self.pa_4.encode('fitz Alan'), ('FTSL', '')) + self.assertEqual(self.pa_4.encode('fitz Geoffrey'), ('FTSJ', 'FTSK')) + self.assertEqual(self.pa_4.encode('fitz Herbert'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz John'), ('FTSJ', '')) + self.assertEqual(self.pa_4.encode('fitz Patrick'), ('FTSP', '')) + self.assertEqual(self.pa_4.encode('fitz Payn'), ('FTSP', '')) + self.assertEqual(self.pa_4.encode('fitz Piers'), ('FTSP', '')) + self.assertEqual(self.pa_4.encode('fitz Randolph'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz Richard'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz Robert'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz Roy'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz Scrob'), ('FTSS', '')) + self.assertEqual(self.pa_4.encode('fitz Walter'), ('FTSL', '')) + self.assertEqual(self.pa_4.encode('fitz Warin'), ('FTSR', '')) + self.assertEqual(self.pa_4.encode('fitz Williams'), ('FTSL', '')) + self.assertEqual(self.pa_4.encode('la Zouche'), ('LSX', 'LSK')) + self.assertEqual(self.pa_4.encode('le Botiller'), ('LPTL', '')) + self.assertEqual(self.pa_4.encode('le Despenser'), ('LTSP', '')) + self.assertEqual(self.pa_4.encode('le deSpencer'), ('LTSP', '')) + self.assertEqual(self.pa_4.encode('of Allendale'), ('AFLN', '')) + self.assertEqual(self.pa_4.encode('of Angouleme'), ('AFNK', '')) + self.assertEqual(self.pa_4.encode('of Anjou'), ('AFNJ', '')) + self.assertEqual(self.pa_4.encode('of Aquitaine'), ('AFKT', '')) + self.assertEqual(self.pa_4.encode('of Aumale'), ('AFML', '')) + self.assertEqual(self.pa_4.encode('of Bavaria'), ('AFPF', '')) + self.assertEqual(self.pa_4.encode('of Boulogne'), ('AFPL', '')) + self.assertEqual(self.pa_4.encode('of Brittany'), ('AFPR', '')) + self.assertEqual(self.pa_4.encode('of Brittary'), ('AFPR', '')) + self.assertEqual(self.pa_4.encode('of Castile'), ('AFKS', '')) + self.assertEqual(self.pa_4.encode('of Chester'), ('AFXS', 'AFKS')) + self.assertEqual(self.pa_4.encode('of Clermont'), ('AFKL', '')) + 
self.assertEqual(self.pa_4.encode('of Cologne'), ('AFKL', '')) + self.assertEqual(self.pa_4.encode('of Dinan'), ('AFTN', '')) + self.assertEqual(self.pa_4.encode('of Dunbar'), ('AFTN', '')) + self.assertEqual(self.pa_4.encode('of England'), ('AFNK', '')) + self.assertEqual(self.pa_4.encode('of Essex'), ('AFSK', '')) + self.assertEqual(self.pa_4.encode('of Falaise'), ('AFFL', '')) + self.assertEqual(self.pa_4.encode('of Flanders'), ('AFFL', '')) + self.assertEqual(self.pa_4.encode('of Galloway'), ('AFKL', '')) + self.assertEqual(self.pa_4.encode('of Germany'), ('AFKR', 'AFJR')) + self.assertEqual(self.pa_4.encode('of Gloucester'), ('AFKL', '')) + self.assertEqual(self.pa_4.encode('of Heristal'), ('AFRS', '')) + self.assertEqual(self.pa_4.encode('of Hungary'), ('AFNK', '')) + self.assertEqual(self.pa_4.encode('of Huntington'), ('AFNT', '')) + self.assertEqual(self.pa_4.encode('of Kiev'), ('AFKF', '')) + self.assertEqual(self.pa_4.encode('of Kuno'), ('AFKN', '')) + self.assertEqual(self.pa_4.encode('of Landen'), ('AFLN', '')) + self.assertEqual(self.pa_4.encode('of Laon'), ('AFLN', '')) + self.assertEqual(self.pa_4.encode('of Leinster'), ('AFLN', '')) + self.assertEqual(self.pa_4.encode('of Lens'), ('AFLN', '')) + self.assertEqual(self.pa_4.encode('of Lorraine'), ('AFLR', '')) + self.assertEqual(self.pa_4.encode('of Louvain'), ('AFLF', '')) + self.assertEqual(self.pa_4.encode('of Mercia'), ('AFMR', '')) + self.assertEqual(self.pa_4.encode('of Metz'), ('AFMT', '')) + self.assertEqual(self.pa_4.encode('of Meulan'), ('AFML', '')) + self.assertEqual(self.pa_4.encode('of Nass'), ('AFNS', '')) + self.assertEqual(self.pa_4.encode('of Normandy'), ('AFNR', '')) + self.assertEqual(self.pa_4.encode('of Ohningen'), ('AFNN', '')) + self.assertEqual(self.pa_4.encode('of Orleans'), ('AFRL', '')) + self.assertEqual(self.pa_4.encode('of Poitou'), ('AFPT', '')) + self.assertEqual(self.pa_4.encode('of Polotzk'), ('AFPL', '')) + self.assertEqual(self.pa_4.encode('of Provence'), ('AFPR', '')) + self.assertEqual(self.pa_4.encode('of Ringelheim'), ('AFRN', '')) + self.assertEqual(self.pa_4.encode('of Salisbury'), ('AFSL', '')) + self.assertEqual(self.pa_4.encode('of Saxony'), ('AFSK', '')) + self.assertEqual(self.pa_4.encode('of Scotland'), ('AFSK', '')) + self.assertEqual(self.pa_4.encode('of Senlis'), ('AFSN', '')) + self.assertEqual(self.pa_4.encode('of Stafford'), ('AFST', '')) + self.assertEqual(self.pa_4.encode('of Swabia'), ('AFSP', '')) + self.assertEqual(self.pa_4.encode('of Tongres'), ('AFTN', '')) + self.assertEqual(self.pa_4.encode('of the Tributes'), ('AF0T', 'AFTT')) + self.assertEqual(self.pa_4.encode('unknown'), ('ANKN', '')) + self.assertEqual(self.pa_4.encode('van der Gouda'), ('FNTR', '')) + self.assertEqual(self.pa_4.encode('von Adenbaugh'), ('FNTN', '')) + self.assertEqual(self.pa_4.encode('ARCHITure'), ('ARKT', '')) + self.assertEqual(self.pa_4.encode('Arnoff'), ('ARNF', '')) + self.assertEqual(self.pa_4.encode('Arnow'), ('ARN', 'ARNF')) + self.assertEqual(self.pa_4.encode('DANGER'), ('TNJR', 'TNKR')) + self.assertEqual(self.pa_4.encode('Jankelowicz'), ('JNKL', 'ANKL')) + self.assertEqual(self.pa_4.encode('MANGER'), ('MNJR', 'MNKR')) + self.assertEqual(self.pa_4.encode('McClellan'), ('MKLL', '')) + self.assertEqual(self.pa_4.encode('McHugh'), ('MK', '')) + self.assertEqual(self.pa_4.encode('McLaughlin'), ('MKLF', '')) + self.assertEqual(self.pa_4.encode('ORCHEStra'), ('ARKS', '')) + self.assertEqual(self.pa_4.encode('ORCHID'), ('ARKT', '')) + self.assertEqual(self.pa_4.encode('Pierce'), 
('PRS', '')) + self.assertEqual(self.pa_4.encode('RANGER'), ('RNJR', 'RNKR')) + self.assertEqual(self.pa_4.encode('Schlesinger'), ('XLSN', 'SLSN')) + self.assertEqual(self.pa_4.encode('Uomo'), ('AM', '')) + self.assertEqual(self.pa_4.encode('Vasserman'), ('FSRM', '')) + self.assertEqual(self.pa_4.encode('Wasserman'), ('ASRM', 'FSRM')) + self.assertEqual(self.pa_4.encode('Womo'), ('AM', 'FM')) + self.assertEqual(self.pa_4.encode('Yankelovich'), ('ANKL', '')) + self.assertEqual(self.pa_4.encode('accede'), ('AKST', '')) + self.assertEqual(self.pa_4.encode('accident'), ('AKST', '')) + self.assertEqual(self.pa_4.encode('adelsheim'), ('ATLS', '')) + self.assertEqual(self.pa_4.encode('aged'), ('AJT', 'AKT')) + self.assertEqual(self.pa_4.encode('ageless'), ('AJLS', 'AKLS')) + self.assertEqual(self.pa_4.encode('agency'), ('AJNS', 'AKNS')) + self.assertEqual(self.pa_4.encode('aghast'), ('AKST', '')) + self.assertEqual(self.pa_4.encode('agio'), ('AJ', 'AK')) + self.assertEqual(self.pa_4.encode('agrimony'), ('AKRM', '')) + self.assertEqual(self.pa_4.encode('album'), ('ALPM', '')) + self.assertEqual(self.pa_4.encode('alcmene'), ('ALKM', '')) + self.assertEqual(self.pa_4.encode('alehouse'), ('ALHS', '')) + self.assertEqual(self.pa_4.encode('antique'), ('ANTK', '')) + self.assertEqual(self.pa_4.encode('artois'), ('ART', 'ARTS')) + self.assertEqual(self.pa_4.encode('automation'), ('ATMX', '')) + self.assertEqual(self.pa_4.encode('bacchus'), ('PKS', '')) + self.assertEqual(self.pa_4.encode('bacci'), ('PX', '')) + self.assertEqual(self.pa_4.encode('bajador'), ('PJTR', 'PHTR')) + self.assertEqual(self.pa_4.encode('bellocchio'), ('PLX', '')) + self.assertEqual(self.pa_4.encode('bertucci'), ('PRTX', '')) + self.assertEqual(self.pa_4.encode('biaggi'), ('PJ', 'PK')) + self.assertEqual(self.pa_4.encode('bough'), ('P', '')) + self.assertEqual(self.pa_4.encode('breaux'), ('PR', '')) + self.assertEqual(self.pa_4.encode('broughton'), ('PRTN', '')) + self.assertEqual(self.pa_4.encode('cabrillo'), ('KPRL', 'KPR')) + self.assertEqual(self.pa_4.encode('caesar'), ('SSR', '')) + self.assertEqual(self.pa_4.encode('cagney'), ('KKN', '')) + self.assertEqual(self.pa_4.encode('campbell'), ('KMPL', '')) + self.assertEqual(self.pa_4.encode('carlisle'), ('KRLL', '')) + self.assertEqual(self.pa_4.encode('carlysle'), ('KRLL', '')) + self.assertEqual(self.pa_4.encode('chemistry'), ('KMST', '')) + self.assertEqual(self.pa_4.encode('chianti'), ('KNT', '')) + self.assertEqual(self.pa_4.encode('chorus'), ('KRS', '')) + self.assertEqual(self.pa_4.encode('cough'), ('KF', '')) + self.assertEqual(self.pa_4.encode('czerny'), ('SRN', 'XRN')) + self.assertEqual(self.pa_4.encode('deffenbacher'), ('TFNP', '')) + self.assertEqual(self.pa_4.encode('dumb'), ('TM', '')) + self.assertEqual(self.pa_4.encode('edgar'), ('ATKR', '')) + self.assertEqual(self.pa_4.encode('edge'), ('AJ', '')) + self.assertEqual(self.pa_4.encode('filipowicz'), ('FLPT', 'FLPF')) + self.assertEqual(self.pa_4.encode('focaccia'), ('FKX', '')) + self.assertEqual(self.pa_4.encode('gallegos'), ('KLKS', 'KKS')) + self.assertEqual(self.pa_4.encode('gambrelli'), ('KMPR', '')) + self.assertEqual(self.pa_4.encode('geithain'), ('K0N', 'JTN')) + self.assertEqual(self.pa_4.encode('ghiradelli'), ('JRTL', '')) + self.assertEqual(self.pa_4.encode('ghislane'), ('JLN', '')) + self.assertEqual(self.pa_4.encode('gough'), ('KF', '')) + self.assertEqual(self.pa_4.encode('hartheim'), ('HR0M', 'HRTM')) + self.assertEqual(self.pa_4.encode('heimsheim'), ('HMSM', '')) + 
self.assertEqual(self.pa_4.encode('hochmeier'), ('HKMR', '')) + self.assertEqual(self.pa_4.encode('hugh'), ('H', '')) + self.assertEqual(self.pa_4.encode('hunger'), ('HNKR', 'HNJR')) + self.assertEqual(self.pa_4.encode('hungry'), ('HNKR', '')) + self.assertEqual(self.pa_4.encode('island'), ('ALNT', '')) + self.assertEqual(self.pa_4.encode('isle'), ('AL', '')) + self.assertEqual(self.pa_4.encode('jose'), ('HS', '')) + self.assertEqual(self.pa_4.encode('laugh'), ('LF', '')) + self.assertEqual(self.pa_4.encode('mac caffrey'), ('MKFR', '')) + self.assertEqual(self.pa_4.encode('mac gregor'), ('MKRK', '')) + self.assertEqual(self.pa_4.encode('pegnitz'), ('PNTS', 'PKNT')) + self.assertEqual(self.pa_4.encode('piskowitz'), ('PSKT', 'PSKF')) + self.assertEqual(self.pa_4.encode('queen'), ('KN', '')) + self.assertEqual(self.pa_4.encode('raspberry'), ('RSPR', '')) + self.assertEqual(self.pa_4.encode('resnais'), ('RSN', 'RSNS')) + self.assertEqual(self.pa_4.encode('rogier'), ('RJ', 'RJR')) + self.assertEqual(self.pa_4.encode('rough'), ('RF', '')) + self.assertEqual(self.pa_4.encode('san jacinto'), ('SNHS', '')) + self.assertEqual(self.pa_4.encode('schenker'), ('XNKR', 'SKNK')) + self.assertEqual(self.pa_4.encode('schermerhorn'), ('XRMR', 'SKRM')) + self.assertEqual(self.pa_4.encode('schmidt'), ('XMT', 'SMT')) + self.assertEqual(self.pa_4.encode('schneider'), ('XNTR', 'SNTR')) + self.assertEqual(self.pa_4.encode('school'), ('SKL', '')) + self.assertEqual(self.pa_4.encode('schooner'), ('SKNR', '')) + self.assertEqual(self.pa_4.encode('schrozberg'), ('XRSP', 'SRSP')) + self.assertEqual(self.pa_4.encode('schulman'), ('XLMN', '')) + self.assertEqual(self.pa_4.encode('schwabach'), ('XPK', 'XFPK')) + self.assertEqual(self.pa_4.encode('schwarzach'), ('XRSK', 'XFRT')) + self.assertEqual(self.pa_4.encode('smith'), ('SM0', 'XMT')) + self.assertEqual(self.pa_4.encode('snider'), ('SNTR', 'XNTR')) + self.assertEqual(self.pa_4.encode('succeed'), ('SKST', '')) + self.assertEqual(self.pa_4.encode('sugarcane'), ('XKRK', 'SKRK')) + self.assertEqual(self.pa_4.encode('svobodka'), ('SFPT', '')) + self.assertEqual(self.pa_4.encode('tagliaro'), ('TKLR', 'TLR')) + self.assertEqual(self.pa_4.encode('thames'), ('TMS', '')) + self.assertEqual(self.pa_4.encode('theilheim'), ('0LM', 'TLM')) + self.assertEqual(self.pa_4.encode('thomas'), ('TMS', '')) + self.assertEqual(self.pa_4.encode('thumb'), ('0M', 'TM')) + self.assertEqual(self.pa_4.encode('tichner'), ('TXNR', 'TKNR')) + self.assertEqual(self.pa_4.encode('tough'), ('TF', '')) + self.assertEqual(self.pa_4.encode('umbrella'), ('AMPR', '')) + self.assertEqual(self.pa_4.encode('vilshofen'), ('FLXF', '')) + self.assertEqual(self.pa_4.encode('von schuller'), ('FNXL', '')) + self.assertEqual(self.pa_4.encode('wachtler'), ('AKTL', 'FKTL')) + self.assertEqual(self.pa_4.encode('wechsler'), ('AKSL', 'FKSL')) + self.assertEqual(self.pa_4.encode('weikersheim'), ('AKRS', 'FKRS')) + self.assertEqual(self.pa_4.encode('zhao'), ('J', '')) if __name__ == '__main__': diff --git a/tests/phonetic/test_phonetic_fuzzy_soundex.py b/tests/phonetic/test_phonetic_fuzzy_soundex.py index 874e2ff9e..d0ad83e5b 100644 --- a/tests/phonetic/test_phonetic_fuzzy_soundex.py +++ b/tests/phonetic/test_phonetic_fuzzy_soundex.py @@ -40,6 +40,7 @@ class FuzzySoundexTestCases(unittest.TestCase): """ pa = FuzzySoundex() + pa_4 = FuzzySoundex(4) def test_fuzzy_soundex(self): """Test abydos.phonetic.FuzzySoundex.""" @@ -50,18 +51,18 @@ def test_fuzzy_soundex(self): self.assertEqual(self.pa.encode('Christen'), 'K6935') # 
http://books.google.com/books?id=LZrT6eWf9NMC&lpg=PA76&ots=Tex3FqNwGP&dq=%22phonix%20algorithm%22&pg=PA75#v=onepage&q=%22phonix%20algorithm%22&f=false - self.assertEqual(self.pa.encode('peter', 4), 'P360') - self.assertEqual(self.pa.encode('pete', 4), 'P300') - self.assertEqual(self.pa.encode('pedro', 4), 'P360') - self.assertEqual(self.pa.encode('stephen', 4), 'S315') - self.assertEqual(self.pa.encode('steve', 4), 'S310') - self.assertEqual(self.pa.encode('smith', 4), 'S530') - self.assertEqual(self.pa.encode('smythe', 4), 'S530') - self.assertEqual(self.pa.encode('gail', 4), 'G400') - self.assertEqual(self.pa.encode('gayle', 4), 'G400') - self.assertEqual(self.pa.encode('christine', 4), 'K693') - self.assertEqual(self.pa.encode('christina', 4), 'K693') - self.assertEqual(self.pa.encode('kristina', 4), 'K693') + self.assertEqual(self.pa_4.encode('peter'), 'P360') + self.assertEqual(self.pa_4.encode('pete'), 'P300') + self.assertEqual(self.pa_4.encode('pedro'), 'P360') + self.assertEqual(self.pa_4.encode('stephen'), 'S315') + self.assertEqual(self.pa_4.encode('steve'), 'S310') + self.assertEqual(self.pa_4.encode('smith'), 'S530') + self.assertEqual(self.pa_4.encode('smythe'), 'S530') + self.assertEqual(self.pa_4.encode('gail'), 'G400') + self.assertEqual(self.pa_4.encode('gayle'), 'G400') + self.assertEqual(self.pa_4.encode('christine'), 'K693') + self.assertEqual(self.pa_4.encode('christina'), 'K693') + self.assertEqual(self.pa_4.encode('kristina'), 'K693') # etc. (for code coverage) self.assertEqual(self.pa.encode('Wight'), 'W3000') @@ -79,26 +80,33 @@ def test_fuzzy_soundex(self): # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1), - 'N4000000000000000000000000000000000000000000000000' - + '00000000000000', + FuzzySoundex(max_length=-1).encode('Niall'), + 'N400000000000000000000000000000000000000000000000000000000000000', ) - self.assertEqual(self.pa.encode('Niall', max_length=0), 'N400') + self.assertEqual(FuzzySoundex(max_length=0).encode('Niall'), 'N400') # zero_pad tests self.assertEqual( - self.pa.encode('Niall', max_length=-1, zero_pad=False), 'N4' + FuzzySoundex(max_length=-1, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), 'N4' + FuzzySoundex(max_length=0, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=True), 'N400' + FuzzySoundex(max_length=0, zero_pad=True).encode('Niall'), 'N400' ) - self.assertEqual(self.pa.encode('', max_length=4, zero_pad=False), '0') self.assertEqual( - self.pa.encode('', max_length=4, zero_pad=True), '0000' + FuzzySoundex(max_length=4, zero_pad=False).encode(''), '0' ) + self.assertEqual( + FuzzySoundex(max_length=4, zero_pad=True).encode(''), '0000' + ) + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('pete'), 'PT') + self.assertEqual(self.pa.encode_alpha('pedro'), 'PTR') + self.assertEqual(self.pa.encode_alpha('stephen'), 'STPN') + self.assertEqual(self.pa.encode_alpha('steve'), 'STP') # Test wrapper self.assertEqual(fuzzy_soundex('Kristen'), 'K6935') diff --git a/tests/phonetic/test_phonetic_haase.py b/tests/phonetic/test_phonetic_haase.py index f58d896da..aafbf5464 100644 --- a/tests/phonetic/test_phonetic_haase.py +++ b/tests/phonetic/test_phonetic_haase.py @@ -56,9 +56,7 @@ def test_haase_phonetik(self): # coverage completion self.assertEqual(self.pa.encode('Häschen'), ('9896', '9496')) - self.assertEqual( - self.pa.encode('Häschen', primary_only=True), ('9896',) - ) + 
self.assertEqual(Haase(primary_only=True).encode('Häschen'), ('9896',)) self.assertEqual(self.pa.encode('Eichörnchen'), ('94976496',)) self.assertEqual(self.pa.encode('Hexe'), ('9489',)) self.assertEqual(self.pa.encode('Chemie'), ('4969', '8969')) @@ -80,6 +78,12 @@ def test_haase_phonetik(self): self.assertEqual(self.pa.encode('kickx'), ('4948',)) self.assertEqual(self.pa.encode('sanctionen'), ('896829696',)) + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Niveau'), ('NAFA',)) + self.assertEqual(self.pa.encode_alpha('Korb'), ('KARP', 'KARF')) + self.assertEqual(self.pa.encode_alpha('Heino'), ('ANA', 'ANAF')) + self.assertEqual(self.pa.encode_alpha('Nekka'), ('NAKA', 'NAKAR')) + # Test wrapper self.assertEqual(haase_phonetik('Häschen'), ('9896', '9496')) diff --git a/tests/phonetic/test_phonetic_henry_early.py b/tests/phonetic/test_phonetic_henry_early.py index 8ca80c83f..deb6949e3 100644 --- a/tests/phonetic/test_phonetic_henry_early.py +++ b/tests/phonetic/test_phonetic_henry_early.py @@ -103,7 +103,9 @@ def test_henry_early(self): self.assertEqual(self.pa.encode('Renault'), 'RN') self.assertEqual(self.pa.encode('Czech'), 'CSK') self.assertEqual(self.pa.encode('Imran'), 'ER') - self.assertEqual(self.pa.encode('Christopher', max_length=-1), 'KRXF') + self.assertEqual( + HenryEarly(max_length=-1).encode('Christopher'), 'KRXF' + ) # Test wrapper self.assertEqual(henry_early('Gausselin'), 'GSL') diff --git a/tests/phonetic/test_phonetic_lein.py b/tests/phonetic/test_phonetic_lein.py index f2584d8fd..5d080295a 100644 --- a/tests/phonetic/test_phonetic_lein.py +++ b/tests/phonetic/test_phonetic_lein.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 by Christopher C. Little. +# Copyright 2014-2019 by Christopher C. Little. # This file is part of Abydos. # # Abydos is free software: you can redistribute it and/or modify @@ -18,7 +18,7 @@ """abydos.tests.phonetic.test_phonetic_lein. -This module contains unit tests for abydos.phonetic.Lein +This module contains unit tests for abydos.phonetic.LEIN """ from __future__ import ( @@ -30,19 +30,20 @@ import unittest -from abydos.phonetic import Lein, lein +from abydos.phonetic import LEIN, lein class LeinTestCases(unittest.TestCase): - """Test Lein functions. + """Test LEIN functions. 
+    """Test LEIN functions.
 
-    test cases for abydos.phonetic.Lein
+    test cases for abydos.phonetic.LEIN
     """
 
-    pa = Lein()
+    pa = LEIN()
+    pa_n0 = LEIN(zero_pad=False)
 
     def test_lein(self):
-        """Test abydos.phonetic.Lein."""
+        """Test abydos.phonetic.LEIN."""
         self.assertEqual(self.pa.encode(''), '0000')
 
         # https://naldc.nal.usda.gov/download/27833/PDF
@@ -113,10 +114,14 @@ def test_lein(self):
         self.assertEqual(self.pa.encode('Lüdenscheidt'), 'L125')
 
         # Coverage
-        self.assertEqual(
-            self.pa.encode('Lüdenscheidt', zero_pad=False), 'L125'
-        )
-        self.assertEqual(self.pa.encode('Smith', zero_pad=False), 'S21')
+        self.assertEqual(self.pa_n0.encode('Lüdenscheidt'), 'L125')
+        self.assertEqual(self.pa_n0.encode('Smith'), 'S21')
+
+        # encode_alpha
+        self.assertEqual(self.pa.encode_alpha('Deveaux'), 'DPK')
+        self.assertEqual(self.pa.encode_alpha('Devies'), 'DPK')
+        self.assertEqual(self.pa.encode_alpha('Sand'), 'SNT')
+        self.assertEqual(self.pa.encode_alpha('Sandau'), 'SNT')
 
         # Test wrapper
         self.assertEqual(lein('Dubose'), 'D450')
diff --git a/tests/phonetic/test_phonetic_meta_soundex.py b/tests/phonetic/test_phonetic_meta_soundex.py
new file mode 100644
index 000000000..a296175ab
--- /dev/null
+++ b/tests/phonetic/test_phonetic_meta_soundex.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
+
+"""abydos.tests.phonetic.test_phonetic_meta_soundex.
+
+This module contains unit tests for abydos.phonetic.MetaSoundex
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+import unittest
+
+from abydos.phonetic import MetaSoundex, metasoundex
+
+
+class MetaSoundexTestCases(unittest.TestCase):
+    """Test MetaSoundex functions.
+ + test cases for abydos.phonetic.MetaSoundex + """ + + pa = MetaSoundex() + pa_en = MetaSoundex(lang='en') + pa_es = MetaSoundex(lang='es') + + def test_meta_soundex(self): + """Test abydos.phonetic.MetaSoundex.""" + # Base cases + self.assertEqual(self.pa.encode(''), '0000') + self.assertEqual(self.pa_en.encode(''), '0000') + self.assertEqual(self.pa_es.encode(''), '') + + # Top 10 Anglo surnames in US + self.assertEqual(self.pa_en.encode('Smith'), '4500') + self.assertEqual(self.pa_en.encode('Johnson'), '1525') + self.assertEqual(self.pa_en.encode('Williams'), '7452') + self.assertEqual(self.pa_en.encode('Brown'), '7650') + self.assertEqual(self.pa_en.encode('Jones'), '1520') + self.assertEqual(self.pa_en.encode('Miller'), '6460') + self.assertEqual(self.pa_en.encode('Davis'), '3120') + self.assertEqual(self.pa_en.encode('Wilson'), '7425') + self.assertEqual(self.pa_en.encode('Anderson'), '0536') + self.assertEqual(self.pa_en.encode('Thomas'), '6200') + + self.assertEqual(self.pa_es.encode('Smith'), '4632') + self.assertEqual(self.pa_es.encode('Johnson'), '82646') + self.assertEqual(self.pa_es.encode('Williams'), '564') + self.assertEqual(self.pa_es.encode('Brown'), '196') + self.assertEqual(self.pa_es.encode('Jones'), '864') + self.assertEqual(self.pa_es.encode('Miller'), '659') + self.assertEqual(self.pa_es.encode('Davis'), '314') + self.assertEqual(self.pa_es.encode('Wilson'), '546') + self.assertEqual(self.pa_es.encode('Anderson'), '63946') + self.assertEqual(self.pa_es.encode('Thomas'), '364') + + # Top 10 Mexican surnames + self.assertEqual(self.pa_en.encode('Hernández'), '5653') + self.assertEqual(self.pa_en.encode('García'), '5620') + self.assertEqual(self.pa_en.encode('Lòpez'), '8120') + self.assertEqual(self.pa_en.encode('Martìnez'), '6635') + self.assertEqual(self.pa_en.encode('Rodrìguez'), '9362') + self.assertEqual(self.pa_en.encode('González'), '5524') + self.assertEqual(self.pa_en.encode('Pérez'), '7620') + self.assertEqual(self.pa_en.encode('Sánchez'), '4520') + self.assertEqual(self.pa_en.encode('Gómez'), '5520') + self.assertEqual(self.pa_en.encode('Flores'), '7462') + + self.assertEqual(self.pa_es.encode('Hernández'), '96634') + self.assertEqual(self.pa_es.encode('García'), '894') + self.assertEqual(self.pa_es.encode('Lòpez'), '504') + self.assertEqual(self.pa_es.encode('Martìnez'), '69364') + self.assertEqual(self.pa_es.encode('Rodrìguez'), '93984') + self.assertEqual(self.pa_es.encode('González'), '86454') + self.assertEqual(self.pa_es.encode('Pérez'), '094') + self.assertEqual(self.pa_es.encode('Sánchez'), '4644') + self.assertEqual(self.pa_es.encode('Gómez'), '864') + self.assertEqual(self.pa_es.encode('Flores'), '2594') + + # encode_alpha + self.assertEqual(self.pa_en.encode_alpha('Smith'), 'SN') + self.assertEqual(self.pa_en.encode_alpha('Johnson'), 'JNKN') + self.assertEqual(self.pa_es.encode_alpha('Hernández'), 'RNNTS') + self.assertEqual(self.pa_es.encode_alpha('García'), 'GRS') + + # Test wrapper + self.assertEqual(metasoundex('Smith', lang='en'), '4500') + self.assertEqual(metasoundex('Hernández', lang='es'), '96634') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/phonetic/test_phonetic_metaphone.py b/tests/phonetic/test_phonetic_metaphone.py index 680eda969..2fddfb83b 100644 --- a/tests/phonetic/test_phonetic_metaphone.py +++ b/tests/phonetic/test_phonetic_metaphone.py @@ -40,6 +40,7 @@ class MetaphoneTestCases(unittest.TestCase): """ pa = Metaphone() + pa4 = Metaphone(4) def test_metaphone(self): """Test 
abydos.phonetic.Metaphone.""" @@ -47,40 +48,40 @@ def test_metaphone(self): self.assertEqual(self.pa.encode('...'), '') # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html - self.assertEqual(self.pa.encode('Fishpool', 4), 'FXPL') - self.assertEqual(self.pa.encode('Fishpoole', 4), 'FXPL') - self.assertEqual(self.pa.encode('Gellately', 4), 'JLTL') - self.assertEqual(self.pa.encode('Gelletly', 4), 'JLTL') - self.assertEqual(self.pa.encode('Lowers', 4), 'LWRS') - self.assertEqual(self.pa.encode('Lowerson', 4), 'LWRS') - self.assertEqual(self.pa.encode('Mallabar', 4), 'MLBR') - self.assertEqual(self.pa.encode('Melbert', 4), 'MLBR') - self.assertEqual(self.pa.encode('Melbourn', 4), 'MLBR') - self.assertEqual(self.pa.encode('Melbourne', 4), 'MLBR') - self.assertEqual(self.pa.encode('Melburg', 4), 'MLBR') - self.assertEqual(self.pa.encode('Melbury', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milberry', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milborn', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milbourn', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milbourne', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milburn', 4), 'MLBR') - self.assertEqual(self.pa.encode('Milburne', 4), 'MLBR') - self.assertEqual(self.pa.encode('Millberg', 4), 'MLBR') - self.assertEqual(self.pa.encode('Mulberry', 4), 'MLBR') - self.assertEqual(self.pa.encode('Mulbery', 4), 'MLBR') - self.assertEqual(self.pa.encode('Mulbry', 4), 'MLBR') - self.assertEqual(self.pa.encode('Saipy', 4), 'SP') - self.assertEqual(self.pa.encode('Sapey', 4), 'SP') - self.assertEqual(self.pa.encode('Sapp', 4), 'SP') - self.assertEqual(self.pa.encode('Sappy', 4), 'SP') - self.assertEqual(self.pa.encode('Sepey', 4), 'SP') - self.assertEqual(self.pa.encode('Seppey', 4), 'SP') - self.assertEqual(self.pa.encode('Sopp', 4), 'SP') - self.assertEqual(self.pa.encode('Zoppie', 4), 'SP') - self.assertEqual(self.pa.encode('Zoppo', 4), 'SP') - self.assertEqual(self.pa.encode('Zupa', 4), 'SP') - self.assertEqual(self.pa.encode('Zupo', 4), 'SP') - self.assertEqual(self.pa.encode('Zuppa', 4), 'SP') + self.assertEqual(self.pa4.encode('Fishpool'), 'FXPL') + self.assertEqual(self.pa4.encode('Fishpoole'), 'FXPL') + self.assertEqual(self.pa4.encode('Gellately'), 'JLTL') + self.assertEqual(self.pa4.encode('Gelletly'), 'JLTL') + self.assertEqual(self.pa4.encode('Lowers'), 'LWRS') + self.assertEqual(self.pa4.encode('Lowerson'), 'LWRS') + self.assertEqual(self.pa4.encode('Mallabar'), 'MLBR') + self.assertEqual(self.pa4.encode('Melbert'), 'MLBR') + self.assertEqual(self.pa4.encode('Melbourn'), 'MLBR') + self.assertEqual(self.pa4.encode('Melbourne'), 'MLBR') + self.assertEqual(self.pa4.encode('Melburg'), 'MLBR') + self.assertEqual(self.pa4.encode('Melbury'), 'MLBR') + self.assertEqual(self.pa4.encode('Milberry'), 'MLBR') + self.assertEqual(self.pa4.encode('Milborn'), 'MLBR') + self.assertEqual(self.pa4.encode('Milbourn'), 'MLBR') + self.assertEqual(self.pa4.encode('Milbourne'), 'MLBR') + self.assertEqual(self.pa4.encode('Milburn'), 'MLBR') + self.assertEqual(self.pa4.encode('Milburne'), 'MLBR') + self.assertEqual(self.pa4.encode('Millberg'), 'MLBR') + self.assertEqual(self.pa4.encode('Mulberry'), 'MLBR') + self.assertEqual(self.pa4.encode('Mulbery'), 'MLBR') + self.assertEqual(self.pa4.encode('Mulbry'), 'MLBR') + self.assertEqual(self.pa4.encode('Saipy'), 'SP') + self.assertEqual(self.pa4.encode('Sapey'), 'SP') + self.assertEqual(self.pa4.encode('Sapp'), 'SP') + self.assertEqual(self.pa4.encode('Sappy'), 'SP') + 
self.assertEqual(self.pa4.encode('Sepey'), 'SP') + self.assertEqual(self.pa4.encode('Seppey'), 'SP') + self.assertEqual(self.pa4.encode('Sopp'), 'SP') + self.assertEqual(self.pa4.encode('Zoppie'), 'SP') + self.assertEqual(self.pa4.encode('Zoppo'), 'SP') + self.assertEqual(self.pa4.encode('Zupa'), 'SP') + self.assertEqual(self.pa4.encode('Zupo'), 'SP') + self.assertEqual(self.pa4.encode('Zuppa'), 'SP') # assorted tests to complete code coverage self.assertEqual(self.pa.encode('Xavier'), 'SFR') @@ -91,14 +92,32 @@ def test_metaphone(self): self.assertEqual(self.pa.encode('Horatio'), 'HRX') self.assertEqual(self.pa.encode('Ignatio'), 'IKNX') self.assertEqual(self.pa.encode('Lucretia'), 'LKRX') + self.assertEqual(self.pa.encode('Wright'), 'RKT') + self.assertEqual(self.pa.encode('White'), 'WT') + self.assertEqual(self.pa.encode('Black'), 'BLK') + self.assertEqual(self.pa.encode('Chance'), 'XNS') + self.assertEqual(self.pa.encode('Dgengo'), 'JJNK') + self.assertEqual(self.pa.encode('Ghost'), 'ST') + self.assertEqual(self.pa.encode('Qing'), 'KNK') + self.assertEqual(self.pa.encode('Asia'), 'AX') + self.assertEqual(self.pa.encode('Ax'), 'AKS') + self.assertEqual(self.pa.encode('Thegn'), '0N') + self.assertEqual(self.pa.encode('acknowledged'), 'AKNLJT') + self.assertEqual(self.pa.encode('awkward'), 'AKWRT') + self.assertEqual(self.pa.encode('admitted'), 'ATMTT') + self.assertEqual(self.pa.encode('dahl'), 'TL') + self.assertEqual(self.pa.encode('autobiography'), 'ATBKRF') + self.assertEqual(self.pa.encode('exaggerate'), 'EKSKRT') + self.assertEqual(self.pa.encode('pitch'), 'PX') + self.assertEqual(self.pa.encode('chracter'), 'KRKTR') # assorted tests to complete branch coverage self.assertEqual(self.pa.encode('Lamb'), 'LM') self.assertEqual(self.pa.encode('science'), 'SNS') # max_length bounds tests - self.assertEqual(self.pa.encode('Niall', max_length=-1), 'NL') - self.assertEqual(self.pa.encode('Niall', max_length=0), 'NL') + self.assertEqual(Metaphone(max_length=-1).encode('Niall'), 'NL') + self.assertEqual(Metaphone(max_length=0).encode('Niall'), 'NL') # Test wrapper self.assertEqual(metaphone('Xavier'), 'SFR') diff --git a/tests/phonetic/test_phonetic_metasoundex.py b/tests/phonetic/test_phonetic_metasoundex.py deleted file mode 100644 index ed0bb979a..000000000 --- a/tests/phonetic/test_phonetic_metasoundex.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018 by Christopher C. Little. -# This file is part of Abydos. -# -# Abydos is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Abydos is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Abydos. If not, see . - -"""abydos.tests.phonetic.test_phonetic_metasoundex. - -This module contains unit tests for abydos.phonetic.MetaSoundex -""" - -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals, -) - -import unittest - -from abydos.phonetic import MetaSoundex, metasoundex - - -class MetaSoundexTestCases(unittest.TestCase): - """Test MetaSoundex functions. 
- - test cases for abydos.phonetic.MetaSoundex - """ - - pa = MetaSoundex() - - def test_metasoundex(self): - """Test abydos.phonetic.MetaSoundex.""" - # Base cases - self.assertEqual(self.pa.encode(''), '0000') - self.assertEqual(self.pa.encode('', lang='en'), '0000') - self.assertEqual(self.pa.encode('', lang='es'), '') - - # Top 10 Anglo surnames in US - self.assertEqual(self.pa.encode('Smith', lang='en'), '4500') - self.assertEqual(self.pa.encode('Johnson', lang='en'), '1525') - self.assertEqual(self.pa.encode('Williams', lang='en'), '7452') - self.assertEqual(self.pa.encode('Brown', lang='en'), '7650') - self.assertEqual(self.pa.encode('Jones', lang='en'), '1520') - self.assertEqual(self.pa.encode('Miller', lang='en'), '6460') - self.assertEqual(self.pa.encode('Davis', lang='en'), '3120') - self.assertEqual(self.pa.encode('Wilson', lang='en'), '7425') - self.assertEqual(self.pa.encode('Anderson', lang='en'), '0536') - self.assertEqual(self.pa.encode('Thomas', lang='en'), '6200') - - self.assertEqual(self.pa.encode('Smith', lang='es'), '4632') - self.assertEqual(self.pa.encode('Johnson', lang='es'), '82646') - self.assertEqual(self.pa.encode('Williams', lang='es'), '564') - self.assertEqual(self.pa.encode('Brown', lang='es'), '196') - self.assertEqual(self.pa.encode('Jones', lang='es'), '864') - self.assertEqual(self.pa.encode('Miller', lang='es'), '659') - self.assertEqual(self.pa.encode('Davis', lang='es'), '314') - self.assertEqual(self.pa.encode('Wilson', lang='es'), '546') - self.assertEqual(self.pa.encode('Anderson', lang='es'), '63946') - self.assertEqual(self.pa.encode('Thomas', lang='es'), '364') - - # Top 10 Mexican surnames - self.assertEqual(self.pa.encode('Hernández', lang='en'), '5653') - self.assertEqual(self.pa.encode('García', lang='en'), '5620') - self.assertEqual(self.pa.encode('Lòpez', lang='en'), '8120') - self.assertEqual(self.pa.encode('Martìnez', lang='en'), '6635') - self.assertEqual(self.pa.encode('Rodrìguez', lang='en'), '9362') - self.assertEqual(self.pa.encode('González', lang='en'), '5524') - self.assertEqual(self.pa.encode('Pérez', lang='en'), '7620') - self.assertEqual(self.pa.encode('Sánchez', lang='en'), '4520') - self.assertEqual(self.pa.encode('Gómez', lang='en'), '5520') - self.assertEqual(self.pa.encode('Flores', lang='en'), '7462') - - self.assertEqual(self.pa.encode('Hernández', lang='es'), '96634') - self.assertEqual(self.pa.encode('García', lang='es'), '894') - self.assertEqual(self.pa.encode('Lòpez', lang='es'), '504') - self.assertEqual(self.pa.encode('Martìnez', lang='es'), '69364') - self.assertEqual(self.pa.encode('Rodrìguez', lang='es'), '93984') - self.assertEqual(self.pa.encode('González', lang='es'), '86454') - self.assertEqual(self.pa.encode('Pérez', lang='es'), '094') - self.assertEqual(self.pa.encode('Sánchez', lang='es'), '4644') - self.assertEqual(self.pa.encode('Gómez', lang='es'), '864') - self.assertEqual(self.pa.encode('Flores', lang='es'), '2594') - - # Test wrapper - self.assertEqual(metasoundex('Smith', lang='en'), '4500') - self.assertEqual(metasoundex('Hernández', lang='es'), '96634') - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/phonetic/test_phonetic_nrl.py b/tests/phonetic/test_phonetic_nrl.py index 5453751f7..928292c24 100644 --- a/tests/phonetic/test_phonetic_nrl.py +++ b/tests/phonetic/test_phonetic_nrl.py @@ -158,7 +158,7 @@ def test_nrl(self): self.assertEqual(self.pa.encode('doctor'), 'dAAktER') self.assertEqual(self.pa.encode('provide'), 'prAHvAYd') self.assertEqual(self.pa.encode('thus'), 
'DHAHs') - self.assertEqual(self.pa.encode('won\'t'), 'wOWnt') + self.assertEqual(self.pa.encode("won't"), 'wOWnt') self.assertEqual(self.pa.encode('cook'), 'kUHk') self.assertEqual(self.pa.encode('bones'), 'bOWnz') self.assertEqual(self.pa.encode('tail'), 'tEYl') @@ -166,7 +166,7 @@ def test_nrl(self): self.assertEqual(self.pa.encode('modern'), 'mOWdERn') self.assertEqual(self.pa.encode('compound'), 'kAAmpAWnd') self.assertEqual(self.pa.encode('mine'), 'mAYn') - self.assertEqual(self.pa.encode('wasn\'t'), 'wAAzAXnt') + self.assertEqual(self.pa.encode("wasn't"), 'wAAzAXnt') self.assertEqual(self.pa.encode('fit'), 'fIHt') self.assertEqual(self.pa.encode('addition'), 'AEddIHSHAXn') self.assertEqual(self.pa.encode('belong'), 'bIHlAONG') @@ -249,18 +249,18 @@ def test_nrl(self): self.assertEqual(self.pa.encode('sharp'), 'SHAArp') self.assertEqual(self.pa.encode('company'), 'kAAmpAEnIH') self.assertEqual(self.pa.encode('radio'), 'rEYdIHOW') - self.assertEqual(self.pa.encode('we\'ll'), 'wEHl') + self.assertEqual(self.pa.encode("we'll"), 'wEHl') self.assertEqual(self.pa.encode('action'), 'AEkSHAXn') self.assertEqual(self.pa.encode('capital'), 'kAEpIHtAXl') self.assertEqual(self.pa.encode('factories'), 'fAEktAOrIYs') self.assertEqual(self.pa.encode('settled'), 'sEHttld') self.assertEqual(self.pa.encode('yellow'), 'yEHlOW') - self.assertEqual(self.pa.encode('isn\'t'), 'IHzAXnt') + self.assertEqual(self.pa.encode("isn't"), 'IHzAXnt') self.assertEqual(self.pa.encode('southern'), 'sAWDHERn') self.assertEqual(self.pa.encode('truck'), 'trAHk') self.assertEqual(self.pa.encode('train'), 'trEYn') self.assertEqual(self.pa.encode('printed'), 'prIHntIHd') - self.assertEqual(self.pa.encode('wouldn\'t'), 'wUHdnt') + self.assertEqual(self.pa.encode("wouldn't"), 'wUHdnt') self.assertEqual(self.pa.encode('ahead'), 'EYhEHd') self.assertEqual(self.pa.encode('chance'), 'CHAEns') self.assertEqual(self.pa.encode('born'), 'bAOrn') @@ -320,7 +320,7 @@ def test_nrl(self): self.assertEqual(self.pa.encode('difficult'), 'dIHffIHkAHlt') self.assertEqual(self.pa.encode('match'), 'mAEtCH') self.assertEqual(self.pa.encode('win'), 'wIHn') - self.assertEqual(self.pa.encode('doesn\'t'), 'dAHznt') + self.assertEqual(self.pa.encode("doesn't"), 'dAHznt') self.assertEqual(self.pa.encode('steel'), 'stIYl') self.assertEqual(self.pa.encode('total'), 'tAAtAXl') self.assertEqual(self.pa.encode('deal'), 'dIYl') diff --git a/tests/phonetic/test_phonetic_nysiis.py b/tests/phonetic/test_phonetic_nysiis.py index f40905bfa..9acf0cccb 100644 --- a/tests/phonetic/test_phonetic_nysiis.py +++ b/tests/phonetic/test_phonetic_nysiis.py @@ -40,14 +40,17 @@ class NysiisTestCases(unittest.TestCase): """ pa = NYSIIS() + pa_20 = NYSIIS(max_length=20) + pa_8mod = NYSIIS(max_length=8, modified=True) + pa_mod = NYSIIS(modified=True) def test_nysiis(self): """Test abydos.phonetic.NYSIIS.""" self.assertEqual(self.pa.encode(''), '') # http://coryodaniel.com/index.php/2009/12/30/ruby-nysiis-implementation/ - self.assertEqual(self.pa.encode('O\'Daniel'), 'ODANAL') - self.assertEqual(self.pa.encode('O\'Donnel'), 'ODANAL') + self.assertEqual(self.pa.encode("O'Daniel"), 'ODANAL') + self.assertEqual(self.pa.encode("O'Donnel"), 'ODANAL') self.assertEqual(self.pa.encode('Cory'), 'CARY') self.assertEqual(self.pa.encode('Corey'), 'CARY') self.assertEqual(self.pa.encode('Kory'), 'CARY') @@ -61,9 +64,7 @@ def test_nysiis(self): self.assertEqual(self.pa.encode('Dugall'), 'DAGAL') self.assertEqual(self.pa.encode('Dugall'), 'DAGAL') self.assertEqual(self.pa.encode('Glinde'), 
'GLAND') - self.assertEqual( - self.pa.encode('Plumridge', max_length=20), 'PLANRADG' - ) + self.assertEqual(self.pa_20.encode('Plumridge'), 'PLANRADG') self.assertEqual(self.pa.encode('Chinnick'), 'CANAC') self.assertEqual(self.pa.encode('Chinnock'), 'CANAC') self.assertEqual(self.pa.encode('Chinnock'), 'CANAC') @@ -79,8 +80,8 @@ def test_nysiis(self): self.assertEqual(self.pa.encode('Sunnex'), 'SANAX') self.assertEqual(self.pa.encode('Sunnucks'), 'SANAC') self.assertEqual(self.pa.encode('Sunock'), 'SANAC') - self.assertEqual(self.pa.encode('Webberley', max_length=20), 'WABARLY') - self.assertEqual(self.pa.encode('Wibberley', max_length=20), 'WABARLY') + self.assertEqual(self.pa_20.encode('Webberley'), 'WABARLY') + self.assertEqual(self.pa_20.encode('Wibberley'), 'WABARLY') # etc. (for code coverage) self.assertEqual(self.pa.encode('Alpharades'), 'ALFARA') @@ -99,254 +100,111 @@ def test_nysiis(self): self.assertEqual(self.pa.encode('Iraq'), 'IRAG') # max_length bounds tests - self.assertEqual(self.pa.encode('Niall', max_length=-1), 'NAL') - self.assertEqual(self.pa.encode('Niall', max_length=0), 'NAL') + self.assertEqual(NYSIIS(max_length=-1).encode('Niall'), 'NAL') + self.assertEqual(NYSIIS(max_length=0).encode('Niall'), 'NAL') # Test wrapper - self.assertEqual(nysiis('O\'Daniel'), 'ODANAL') + self.assertEqual(nysiis("O'Daniel"), 'ODANAL') def test_modified_nysiis(self): """Test abydos.phonetic.NYSIIS (modified version).""" - self.assertEqual(self.pa.encode('', max_length=-1, modified=True), '') + self.assertEqual(NYSIIS(max_length=-1, modified=True).encode(''), '') # https://naldc.nal.usda.gov/download/27833/PDF # Some of these were... wrong... and have been corrected - self.assertEqual( - self.pa.encode('Daves', max_length=8, modified=True), 'DAV' - ) - self.assertEqual( - self.pa.encode('Davies', max_length=8, modified=True), 'DAVY' - ) - self.assertEqual( - self.pa.encode('Devies', max_length=8, modified=True), 'DAFY' - ) - self.assertEqual( - self.pa.encode('Divish', max_length=8, modified=True), 'DAVAS' - ) - self.assertEqual( - self.pa.encode('Dove', max_length=8, modified=True), 'DAV' - ) - self.assertEqual( - self.pa.encode('Devese', max_length=8, modified=True), 'DAFAS' - ) - self.assertEqual( - self.pa.encode('Devies', max_length=8, modified=True), 'DAFY' - ) - self.assertEqual( - self.pa.encode('Devos', max_length=8, modified=True), 'DAF' - ) - - self.assertEqual( - self.pa.encode('Schmit', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Schmitt', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Schmitz', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Schmoutz', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Schnitt', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smit', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smite', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smits', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smoot', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smuts', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Sneath', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smyth', max_length=8, modified=True), 'SNAT' - ) - self.assertEqual( - self.pa.encode('Smithy', max_length=8, modified=True), 'SNATY' - ) - 
self.assertEqual( - self.pa.encode('Smithey', max_length=8, modified=True), 'SNATY' - ) + self.assertEqual(self.pa_8mod.encode('Daves'), 'DAV') + self.assertEqual(self.pa_8mod.encode('Davies'), 'DAVY') + self.assertEqual(self.pa_8mod.encode('Devies'), 'DAFY') + self.assertEqual(self.pa_8mod.encode('Divish'), 'DAVAS') + self.assertEqual(self.pa_8mod.encode('Dove'), 'DAV') + self.assertEqual(self.pa_8mod.encode('Devese'), 'DAFAS') + self.assertEqual(self.pa_8mod.encode('Devies'), 'DAFY') + self.assertEqual(self.pa_8mod.encode('Devos'), 'DAF') + + self.assertEqual(self.pa_8mod.encode('Schmit'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Schmitt'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Schmitz'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Schmoutz'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Schnitt'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smit'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smite'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smits'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smoot'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smuts'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Sneath'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smyth'), 'SNAT') + self.assertEqual(self.pa_8mod.encode('Smithy'), 'SNATY') + self.assertEqual(self.pa_8mod.encode('Smithey'), 'SNATY') # http://www.dropby.com/NYSIISTextStrings.html # Some of these have been altered since the above uses a different set # of modifications. - self.assertEqual( - self.pa.encode('Edwards', max_length=8, modified=True), 'EDWAD' - ) - self.assertEqual( - self.pa.encode('Perez', max_length=8, modified=True), 'PAR' - ) - self.assertEqual( - self.pa.encode('Macintosh', max_length=8, modified=True), 'MCANTAS' - ) - self.assertEqual( - self.pa.encode('Phillipson', max_length=8, modified=True), - 'FALAPSAN', - ) - self.assertEqual( - self.pa.encode('Haddix', max_length=8, modified=True), 'HADAC' - ) - self.assertEqual( - self.pa.encode('Essex', max_length=8, modified=True), 'ESAC' - ) - self.assertEqual( - self.pa.encode('Moye', max_length=8, modified=True), 'MY' - ) - self.assertEqual( - self.pa.encode('McKee', max_length=8, modified=True), 'MCY' - ) - self.assertEqual( - self.pa.encode('Mackie', max_length=8, modified=True), 'MCY' - ) - self.assertEqual( - self.pa.encode('Heitschmidt', max_length=8, modified=True), - 'HATSNAD', - ) - self.assertEqual( - self.pa.encode('Bart', max_length=8, modified=True), 'BAD' - ) - self.assertEqual( - self.pa.encode('Hurd', max_length=8, modified=True), 'HAD' - ) - self.assertEqual( - self.pa.encode('Hunt', max_length=8, modified=True), 'HAN' - ) - self.assertEqual( - self.pa.encode('Westerlund', max_length=8, modified=True), - 'WASTARLA', - ) - self.assertEqual( - self.pa.encode('Evers', max_length=8, modified=True), 'EVAR' - ) - self.assertEqual( - self.pa.encode('Devito', max_length=8, modified=True), 'DAFAT' - ) - self.assertEqual( - self.pa.encode('Rawson', max_length=8, modified=True), 'RASAN' - ) - self.assertEqual( - self.pa.encode('Shoulders', max_length=8, modified=True), 'SALDAR' - ) - self.assertEqual( - self.pa.encode('Leighton', max_length=8, modified=True), 'LATAN' - ) - self.assertEqual( - self.pa.encode('Wooldridge', max_length=8, modified=True), - 'WALDRAG', - ) - self.assertEqual( - self.pa.encode('Oliphant', max_length=8, modified=True), 'OLAFAN' - ) - self.assertEqual( - self.pa.encode('Hatchett', max_length=8, modified=True), 'HATCAT' - ) - self.assertEqual( - self.pa.encode('McKnight', max_length=8, 
modified=True), 'MCNAT' - ) - self.assertEqual( - self.pa.encode('Rickert', max_length=8, modified=True), 'RACAD' - ) - self.assertEqual( - self.pa.encode('Bowman', max_length=8, modified=True), 'BANAN' - ) - self.assertEqual( - self.pa.encode('Vasquez', max_length=8, modified=True), 'VASG' - ) - self.assertEqual( - self.pa.encode('Bashaw', max_length=8, modified=True), 'BAS' - ) - self.assertEqual( - self.pa.encode('Schoenhoeft', max_length=8, modified=True), - 'SANAFT', - ) - self.assertEqual( - self.pa.encode('Heywood', max_length=8, modified=True), 'HAD' - ) - self.assertEqual( - self.pa.encode('Hayman', max_length=8, modified=True), 'HANAN' - ) - self.assertEqual( - self.pa.encode('Seawright', max_length=8, modified=True), 'SARAT' - ) - self.assertEqual( - self.pa.encode('Kratzer', max_length=8, modified=True), 'CRATSAR' - ) - self.assertEqual( - self.pa.encode('Canaday', max_length=8, modified=True), 'CANADY' - ) - self.assertEqual( - self.pa.encode('Crepeau', max_length=8, modified=True), 'CRAP' - ) + self.assertEqual(self.pa_8mod.encode('Edwards'), 'EDWAD') + self.assertEqual(self.pa_8mod.encode('Perez'), 'PAR') + self.assertEqual(self.pa_8mod.encode('Macintosh'), 'MCANTAS') + self.assertEqual(self.pa_8mod.encode('Phillipson'), 'FALAPSAN') + self.assertEqual(self.pa_8mod.encode('Haddix'), 'HADAC') + self.assertEqual(self.pa_8mod.encode('Essex'), 'ESAC') + self.assertEqual(self.pa_8mod.encode('Moye'), 'MY') + self.assertEqual(self.pa_8mod.encode('McKee'), 'MCY') + self.assertEqual(self.pa_8mod.encode('Mackie'), 'MCY') + self.assertEqual(self.pa_8mod.encode('Heitschmidt'), 'HATSNAD') + self.assertEqual(self.pa_8mod.encode('Bart'), 'BAD') + self.assertEqual(self.pa_8mod.encode('Hurd'), 'HAD') + self.assertEqual(self.pa_8mod.encode('Hunt'), 'HAN') + self.assertEqual(self.pa_8mod.encode('Westerlund'), 'WASTARLA') + self.assertEqual(self.pa_8mod.encode('Evers'), 'EVAR') + self.assertEqual(self.pa_8mod.encode('Devito'), 'DAFAT') + self.assertEqual(self.pa_8mod.encode('Rawson'), 'RASAN') + self.assertEqual(self.pa_8mod.encode('Shoulders'), 'SALDAR') + self.assertEqual(self.pa_8mod.encode('Leighton'), 'LATAN') + self.assertEqual(self.pa_8mod.encode('Wooldridge'), 'WALDRAG') + self.assertEqual(self.pa_8mod.encode('Oliphant'), 'OLAFAN') + self.assertEqual(self.pa_8mod.encode('Hatchett'), 'HATCAT') + self.assertEqual(self.pa_8mod.encode('McKnight'), 'MCNAT') + self.assertEqual(self.pa_8mod.encode('Rickert'), 'RACAD') + self.assertEqual(self.pa_8mod.encode('Bowman'), 'BANAN') + self.assertEqual(self.pa_8mod.encode('Vasquez'), 'VASG') + self.assertEqual(self.pa_8mod.encode('Bashaw'), 'BAS') + self.assertEqual(self.pa_8mod.encode('Schoenhoeft'), 'SANAFT') + self.assertEqual(self.pa_8mod.encode('Heywood'), 'HAD') + self.assertEqual(self.pa_8mod.encode('Hayman'), 'HANAN') + self.assertEqual(self.pa_8mod.encode('Seawright'), 'SARAT') + self.assertEqual(self.pa_8mod.encode('Kratzer'), 'CRATSAR') + self.assertEqual(self.pa_8mod.encode('Canaday'), 'CANADY') + self.assertEqual(self.pa_8mod.encode('Crepeau'), 'CRAP') # Additional tests from @Yomguithereal's talisman # https://github.com/Yomguithereal/talisman/blob/master/test/phonetics/nysiis.js - self.assertEqual( - self.pa.encode('Andrew', max_length=8, modified=True), 'ANDR' - ) - self.assertEqual( - self.pa.encode('Robertson', max_length=8, modified=True), - 'RABARTSA', - ) - self.assertEqual( - self.pa.encode('Nolan', max_length=8, modified=True), 'NALAN' - ) - self.assertEqual( - self.pa.encode('Louis XVI', max_length=8, modified=True), 'LASXV' - ) 
- self.assertEqual( - self.pa.encode('Case', max_length=8, modified=True), 'CAS' - ) - self.assertEqual( - self.pa.encode('Mclaughlin', max_length=8, modified=True), - 'MCLAGLAN', - ) - self.assertEqual( - self.pa.encode('Awale', max_length=8, modified=True), 'AL' - ) - self.assertEqual( - self.pa.encode('Aegir', max_length=8, modified=True), 'AGAR' - ) - self.assertEqual( - self.pa.encode('Lundgren', max_length=8, modified=True), 'LANGRAN' - ) - self.assertEqual( - self.pa.encode('Philbert', max_length=8, modified=True), 'FALBAD' - ) - self.assertEqual( - self.pa.encode('Harry', max_length=8, modified=True), 'HARY' - ) - self.assertEqual( - self.pa.encode('Mackenzie', max_length=8, modified=True), 'MCANSY' - ) + self.assertEqual(self.pa_8mod.encode('Andrew'), 'ANDR') + self.assertEqual(self.pa_8mod.encode('Robertson'), 'RABARTSA') + self.assertEqual(self.pa_8mod.encode('Nolan'), 'NALAN') + self.assertEqual(self.pa_8mod.encode('Louis XVI'), 'LASXV') + self.assertEqual(self.pa_8mod.encode('Case'), 'CAS') + self.assertEqual(self.pa_8mod.encode('Mclaughlin'), 'MCLAGLAN') + self.assertEqual(self.pa_8mod.encode('Awale'), 'AL') + self.assertEqual(self.pa_8mod.encode('Aegir'), 'AGAR') + self.assertEqual(self.pa_8mod.encode('Lundgren'), 'LANGRAN') + self.assertEqual(self.pa_8mod.encode('Philbert'), 'FALBAD') + self.assertEqual(self.pa_8mod.encode('Harry'), 'HARY') + self.assertEqual(self.pa_8mod.encode('Mackenzie'), 'MCANSY') # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1, modified=True), 'NAL' + NYSIIS(max_length=-1, modified=True).encode('Niall'), 'NAL' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, modified=True), 'NAL' + NYSIIS(max_length=0, modified=True).encode('Niall'), 'NAL' ) # coverage - self.assertEqual(self.pa.encode('Sam Jr.', modified=True), 'ERROR') - self.assertEqual(self.pa.encode('John Sr.', modified=True), 'ERROR') - self.assertEqual(self.pa.encode('Wright', modified=True), 'RAT') - self.assertEqual(self.pa.encode('Rhodes', modified=True), 'RAD') - self.assertEqual(self.pa.encode('Dgagoda', modified=True), 'GAGAD') - self.assertEqual(self.pa.encode('Bosch', modified=True), 'BAS') - self.assertEqual(self.pa.encode('Schrader', modified=True), 'SRADAR') + self.assertEqual(self.pa_mod.encode('Sam Jr.'), 'ERROR') + self.assertEqual(self.pa_mod.encode('John Sr.'), 'ERROR') + self.assertEqual(self.pa_mod.encode('Wright'), 'RAT') + self.assertEqual(self.pa_mod.encode('Rhodes'), 'RAD') + self.assertEqual(self.pa_mod.encode('Dgagoda'), 'GAGAD') + self.assertEqual(self.pa_mod.encode('Bosch'), 'BAS') + self.assertEqual(self.pa_mod.encode('Schrader'), 'SRADAR') # Test wrapper self.assertEqual( diff --git a/tests/phonetic/test_phonetic_onca.py b/tests/phonetic/test_phonetic_onca.py index 260c71d87..1686459a1 100644 --- a/tests/phonetic/test_phonetic_onca.py +++ b/tests/phonetic/test_phonetic_onca.py @@ -52,6 +52,11 @@ def test_onca(self): self.assertEqual(self.pa.encode('HORTON'), 'H635') self.assertEqual(self.pa.encode('HOUGHTON'), 'H235') + # encode_alpha + self.assertEqual(self.pa.encode_alpha('HALL'), 'HL') + self.assertEqual(self.pa.encode_alpha('SMITH'), 'SNT') + self.assertEqual(self.pa.encode_alpha('HOUGHTON'), 'HKTN') + # Test wrapper self.assertEqual(onca('HALL'), 'H400') diff --git a/tests/phonetic/test_phonetic_phonet.py b/tests/phonetic/test_phonetic_phonet.py index c1076d017..656b44abc 100644 --- a/tests/phonetic/test_phonetic_phonet.py +++ b/tests/phonetic/test_phonetic_phonet.py @@ -43,59 +43,63 @@ class 
PhonetTestCases(unittest.TestCase): """ pa = Phonet() + pa_1 = Phonet(1) + pa_2 = Phonet(2) + pa_1none = Phonet(1, 'none') + pa_2none = Phonet(2, 'none') def test_phonet_german(self): """Test abydos.phonetic.Phonet (German).""" self.assertEqual(self.pa.encode(''), '') # https://code.google.com/p/phonet4java/source/browse/trunk/src/test/java/com/googlecode/phonet4java/Phonet1Test.java - self.assertEqual(self.pa.encode('', 1), '') - self.assertEqual(self.pa.encode('Zedlitz', 1), 'ZETLIZ') - self.assertEqual(self.pa.encode('Bremerhaven', 1), 'BREMAHAFN') - self.assertEqual(self.pa.encode('Hamburger Hafen', 1), 'HAMBURGA HAFN') - self.assertEqual(self.pa.encode('Jesper', 1), 'IESPA') - self.assertEqual(self.pa.encode('elisabeth', 1), 'ELISABET') - self.assertEqual(self.pa.encode('elisabet', 1), 'ELISABET') - self.assertEqual(self.pa.encode('Ziegler', 1), 'ZIKLA') - self.assertEqual(self.pa.encode('Scherer', 1), 'SHERA') - self.assertEqual(self.pa.encode('Bartels', 1), 'BARTLS') - self.assertEqual(self.pa.encode('Jansen', 1), 'IANSN') - self.assertEqual(self.pa.encode('Sievers', 1), 'SIWAS') - self.assertEqual(self.pa.encode('Michels', 1), 'MICHLS') - self.assertEqual(self.pa.encode('Ewers', 1), 'EWERS') - self.assertEqual(self.pa.encode('Evers', 1), 'EWERS') - self.assertEqual(self.pa.encode('Wessels', 1), 'WESLS') - self.assertEqual(self.pa.encode('Gottschalk', 1), 'GOSHALK') - self.assertEqual(self.pa.encode('Brückmann', 1), 'BRÜKMAN') - self.assertEqual(self.pa.encode('Blechschmidt', 1), 'BLECHSHMIT') - self.assertEqual(self.pa.encode('Kolodziej', 1), 'KOLOTZI') - self.assertEqual(self.pa.encode('Krauße', 1), 'KRAUSE') - self.assertEqual(self.pa.encode('Cachel', 1), 'KESHL') - - self.assertEqual(self.pa.encode('', 2), '') - self.assertEqual(self.pa.encode('Zedlitz', 2), 'ZETLIZ') - self.assertEqual(self.pa.encode('Bremerhaven', 2), 'BRENAFN') - self.assertEqual(self.pa.encode('Schönberg', 2), 'ZÖNBAK') - self.assertEqual(self.pa.encode('Hamburger Hafen', 2), 'ANBURKA AFN') - self.assertEqual(self.pa.encode('Ziegler', 2), 'ZIKLA') - self.assertEqual(self.pa.encode('Scherer', 2), 'ZERA') - self.assertEqual(self.pa.encode('Jansen', 2), 'IANZN') - self.assertEqual(self.pa.encode('Eberhardt', 2), 'EBART') - self.assertEqual(self.pa.encode('Gottschalk', 2), 'KUZALK') - self.assertEqual(self.pa.encode('Brückmann', 2), 'BRIKNAN') - self.assertEqual(self.pa.encode('Blechschmidt', 2), 'BLEKZNIT') - self.assertEqual(self.pa.encode('Kolodziej', 2), 'KULUTZI') - self.assertEqual(self.pa.encode('Krauße', 2), 'KRAUZE') + self.assertEqual(self.pa_1.encode(''), '') + self.assertEqual(self.pa_1.encode('Zedlitz'), 'ZETLIZ') + self.assertEqual(self.pa_1.encode('Bremerhaven'), 'BREMAHAFN') + self.assertEqual(self.pa_1.encode('Hamburger Hafen'), 'HAMBURGA HAFN') + self.assertEqual(self.pa_1.encode('Jesper'), 'IESPA') + self.assertEqual(self.pa_1.encode('elisabeth'), 'ELISABET') + self.assertEqual(self.pa_1.encode('elisabet'), 'ELISABET') + self.assertEqual(self.pa_1.encode('Ziegler'), 'ZIKLA') + self.assertEqual(self.pa_1.encode('Scherer'), 'SHERA') + self.assertEqual(self.pa_1.encode('Bartels'), 'BARTLS') + self.assertEqual(self.pa_1.encode('Jansen'), 'IANSN') + self.assertEqual(self.pa_1.encode('Sievers'), 'SIWAS') + self.assertEqual(self.pa_1.encode('Michels'), 'MICHLS') + self.assertEqual(self.pa_1.encode('Ewers'), 'EWERS') + self.assertEqual(self.pa_1.encode('Evers'), 'EWERS') + self.assertEqual(self.pa_1.encode('Wessels'), 'WESLS') + self.assertEqual(self.pa_1.encode('Gottschalk'), 'GOSHALK') + 
self.assertEqual(self.pa_1.encode('Brückmann'), 'BRÜKMAN') + self.assertEqual(self.pa_1.encode('Blechschmidt'), 'BLECHSHMIT') + self.assertEqual(self.pa_1.encode('Kolodziej'), 'KOLOTZI') + self.assertEqual(self.pa_1.encode('Krauße'), 'KRAUSE') + self.assertEqual(self.pa_1.encode('Cachel'), 'KESHL') + + self.assertEqual(self.pa_2.encode(''), '') + self.assertEqual(self.pa_2.encode('Zedlitz'), 'ZETLIZ') + self.assertEqual(self.pa_2.encode('Bremerhaven'), 'BRENAFN') + self.assertEqual(self.pa_2.encode('Schönberg'), 'ZÖNBAK') + self.assertEqual(self.pa_2.encode('Hamburger Hafen'), 'ANBURKA AFN') + self.assertEqual(self.pa_2.encode('Ziegler'), 'ZIKLA') + self.assertEqual(self.pa_2.encode('Scherer'), 'ZERA') + self.assertEqual(self.pa_2.encode('Jansen'), 'IANZN') + self.assertEqual(self.pa_2.encode('Eberhardt'), 'EBART') + self.assertEqual(self.pa_2.encode('Gottschalk'), 'KUZALK') + self.assertEqual(self.pa_2.encode('Brückmann'), 'BRIKNAN') + self.assertEqual(self.pa_2.encode('Blechschmidt'), 'BLEKZNIT') + self.assertEqual(self.pa_2.encode('Kolodziej'), 'KULUTZI') + self.assertEqual(self.pa_2.encode('Krauße'), 'KRAUZE') # etc. (for code coverage) - self.assertEqual(self.pa.encode('Jesper', 1), 'IESPA') - self.assertEqual(self.pa.encode('Glacéhandschuh', 1), 'GLAZANSHU') - self.assertEqual(self.pa.encode('Blechschmidt', 1), 'BLECHSHMIT') - self.assertEqual(self.pa.encode('Burgdorf', 1), 'BURKDORF') - self.assertEqual(self.pa.encode('Holzschuh', 1), 'HOLSHU') - self.assertEqual(self.pa.encode('Aachen', 1), 'ACHN') + self.assertEqual(self.pa_1.encode('Jesper'), 'IESPA') + self.assertEqual(self.pa_1.encode('Glacéhandschuh'), 'GLAZANSHU') + self.assertEqual(self.pa_1.encode('Blechschmidt'), 'BLECHSHMIT') + self.assertEqual(self.pa_1.encode('Burgdorf'), 'BURKDORF') + self.assertEqual(self.pa_1.encode('Holzschuh'), 'HOLSHU') + self.assertEqual(self.pa_1.encode('Aachen'), 'ACHN') self.assertEqual( - self.pa.encode('Abendspaziergang', 1), 'ABENTSPAZIRGANK' + self.pa_1.encode('Abendspaziergang'), 'ABENTSPAZIRGANK' ) # Test wrapper @@ -103,26 +107,22 @@ def test_phonet_german(self): def test_phonet_nolang(self): """Test abydos.phonetic.Phonet (no language).""" - self.assertEqual(self.pa.encode('', lang='none'), '') + self.assertEqual(Phonet(lang='none').encode(''), '') # https://code.google.com/p/phonet4java/source/browse/trunk/src/test/java/com/googlecode/phonet4java/Phonet1Test.java - self.assertEqual(self.pa.encode('', 1, 'none'), '') - self.assertEqual(self.pa.encode('Zedlitz', 1, 'none'), 'ZEDLITZ') - self.assertEqual( - self.pa.encode('Bremerhaven', 1, 'none'), 'BREMERHAVEN' - ) - self.assertEqual(self.pa.encode('Schönberg', 2, 'none'), 'SCHOENBERG') - self.assertEqual(self.pa.encode('Brückmann', 1, 'none'), 'BRUECKMAN') - self.assertEqual(self.pa.encode('Krauße', 1, 'none'), 'KRAUSE') - - self.assertEqual(self.pa.encode('', 2, 'none'), '') - self.assertEqual(self.pa.encode('Zedlitz', 2, 'none'), 'ZEDLITZ') - self.assertEqual( - self.pa.encode('Bremerhaven', 2, 'none'), 'BREMERHAVEN' - ) - self.assertEqual(self.pa.encode('Schönberg', 2, 'none'), 'SCHOENBERG') - self.assertEqual(self.pa.encode('Brückmann', 2, 'none'), 'BRUECKMAN') - self.assertEqual(self.pa.encode('Krauße', 2, 'none'), 'KRAUSE') + self.assertEqual(self.pa_1none.encode(''), '') + self.assertEqual(self.pa_1none.encode('Zedlitz'), 'ZEDLITZ') + self.assertEqual(self.pa_1none.encode('Bremerhaven'), 'BREMERHAVEN') + self.assertEqual(self.pa_2none.encode('Schönberg'), 'SCHOENBERG') + self.assertEqual(self.pa_1none.encode('Brückmann'), 
'BRUECKMAN') + self.assertEqual(self.pa_1none.encode('Krauße'), 'KRAUSE') + + self.assertEqual(self.pa_2none.encode(''), '') + self.assertEqual(self.pa_2none.encode('Zedlitz'), 'ZEDLITZ') + self.assertEqual(self.pa_2none.encode('Bremerhaven'), 'BREMERHAVEN') + self.assertEqual(self.pa_2none.encode('Schönberg'), 'SCHOENBERG') + self.assertEqual(self.pa_2none.encode('Brückmann'), 'BRUECKMAN') + self.assertEqual(self.pa_2none.encode('Krauße'), 'KRAUSE') # Test wrapper self.assertEqual(phonet('Bremerhaven', 1, 'none'), 'BREMERHAVEN') @@ -141,8 +141,8 @@ def test_phonet_nachnamen(self): # so let's just randomly select about 100 for testing if len(nn_line) >= 3 and _one_in(100): (term, ph1, ph2) = nn_line - self.assertEqual(self.pa.encode(term, 1), ph1) - self.assertEqual(self.pa.encode(term, 2), ph2) + self.assertEqual(self.pa_1.encode(term), ph1) + self.assertEqual(self.pa_2.encode(term), ph2) def test_phonet_ngerman(self): """Test abydos.phonetic.Phonet (ngerman set).""" @@ -158,8 +158,8 @@ def test_phonet_ngerman(self): # so let's just randomly select about 30 for testing if len(ng_line) >= 3 and _one_in(10000): (term, ph1, ph2) = ng_line - self.assertEqual(self.pa.encode(term, 1), ph1) - self.assertEqual(self.pa.encode(term, 2), ph2) + self.assertEqual(self.pa_1.encode(term), ph1) + self.assertEqual(self.pa_2.encode(term), ph2) if __name__ == '__main__': diff --git a/tests/phonetic/test_phonetic_phonetic_spanish.py b/tests/phonetic/test_phonetic_phonetic_spanish.py index 4778d4259..7fcb0fcfb 100644 --- a/tests/phonetic/test_phonetic_phonetic_spanish.py +++ b/tests/phonetic/test_phonetic_phonetic_spanish.py @@ -55,7 +55,15 @@ def test_phonetic_spanish(self): self.assertEqual(self.pa.encode('Sielo'), '45') # Test to maximize coverage - self.assertEqual(self.pa.encode('Giraldo', max_length=2), '89') + self.assertEqual(PhoneticSpanish(max_length=2).encode('Giraldo'), '89') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Giraldo'), 'GRLT') + self.assertEqual(self.pa.encode_alpha('Jiraldo'), 'GRLT') + self.assertEqual(self.pa.encode_alpha('Halla'), 'FL') + self.assertEqual(self.pa.encode_alpha('Haya'), 'FL') + self.assertEqual(self.pa.encode_alpha('Cielo'), 'SL') + self.assertEqual(self.pa.encode_alpha('Sielo'), 'SL') # Test wrapper self.assertEqual(phonetic_spanish('Giraldo'), '8953') diff --git a/tests/phonetic/test_phonetic_phonex.py b/tests/phonetic/test_phonetic_phonex.py index b7ce1012b..30ca97751 100644 --- a/tests/phonetic/test_phonetic_phonex.py +++ b/tests/phonetic/test_phonetic_phonex.py @@ -82,27 +82,29 @@ def test_phonex(self): # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1), - 'N4000000000000000000000000000000000000000000000000' - + '00000000000000', + Phonex(max_length=-1).encode('Niall'), + 'N400000000000000000000000000000000000000000000000000000000000000', ) - self.assertEqual(self.pa.encode('Niall', max_length=0), 'N400') + self.assertEqual(Phonex(max_length=0).encode('Niall'), 'N400') # zero_pad tests self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), 'N4' + Phonex(max_length=0, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), 'N4' + Phonex(max_length=0, zero_pad=True).encode('Niall'), 'N400' ) + self.assertEqual(Phonex(max_length=4, zero_pad=False).encode(''), '0') self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=True), 'N400' - ) - self.assertEqual(self.pa.encode('', max_length=4, zero_pad=False), '0') - self.assertEqual( 
- self.pa.encode('', max_length=4, zero_pad=True), '0000' + Phonex(max_length=4, zero_pad=True).encode(''), '0000' ) + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Ewell'), 'AL') + self.assertEqual(self.pa.encode_alpha('Filp'), 'FP') + self.assertEqual(self.pa.encode_alpha('Heames'), 'AN') + self.assertEqual(self.pa.encode_alpha('Kneves'), 'NP') + # Test wrapper self.assertEqual(phonex('Ewell'), 'A400') diff --git a/tests/phonetic/test_phonetic_phonix.py b/tests/phonetic/test_phonetic_phonix.py index edf846bb7..376b00339 100644 --- a/tests/phonetic/test_phonetic_phonix.py +++ b/tests/phonetic/test_phonetic_phonix.py @@ -62,7 +62,7 @@ def test_phonix(self): # testcases from Wais Module self.assertEqual(self.pa.encode('computer'), 'K513') self.assertEqual(self.pa.encode('computers'), 'K513') - self.assertEqual(self.pa.encode('computers', 5), 'K5138') + self.assertEqual(Phonix(5).encode('computers'), 'K5138') self.assertEqual(self.pa.encode('pfeifer'), 'F700') self.assertEqual(self.pa.encode('pfeiffer'), 'F700') self.assertEqual(self.pa.encode('knight'), 'N300') @@ -102,25 +102,31 @@ def test_phonix(self): # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1), 'N4' + '0' * 62 + Phonix(max_length=-1).encode('Niall'), 'N4' + '0' * 62 ) - self.assertEqual(self.pa.encode('Niall', max_length=0), 'N400') + self.assertEqual(Phonix(max_length=0).encode('Niall'), 'N400') # zero_pad tests self.assertEqual( - self.pa.encode('Niall', max_length=-1, zero_pad=False), 'N4' + Phonix(max_length=-1, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), 'N4' + Phonix(max_length=0, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=True), 'N400' + Phonix(max_length=0, zero_pad=True).encode('Niall'), 'N400' ) - self.assertEqual(self.pa.encode('', max_length=4, zero_pad=False), '0') + self.assertEqual(Phonix(max_length=4, zero_pad=False).encode(''), '0') self.assertEqual( - self.pa.encode('', max_length=4, zero_pad=True), '0000' + Phonix(max_length=4, zero_pad=True).encode(''), '0000' ) + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Müller'), 'ML') + self.assertEqual(self.pa.encode_alpha('schneider'), 'SNT') + self.assertEqual(self.pa.encode_alpha('fischer'), 'FS') + self.assertEqual(self.pa.encode_alpha('weber'), 'WP') + # Test wrapper self.assertEqual(phonix('Müller'), 'M400') diff --git a/tests/phonetic/test_phonetic_pshp_soundex_first.py b/tests/phonetic/test_phonetic_pshp_soundex_first.py index ff6298e8b..a793da6d1 100644 --- a/tests/phonetic/test_phonetic_pshp_soundex_first.py +++ b/tests/phonetic/test_phonetic_pshp_soundex_first.py @@ -40,6 +40,8 @@ class PSHPSoundexTestCases(unittest.TestCase): """ pa = PSHPSoundexFirst() + pa_german = PSHPSoundexFirst(german=True) + pa_unl = PSHPSoundexFirst(max_length=-1) def test_pshp_soundex_first(self): """Test abydos.phonetic.PSHPSoundexFirst.""" @@ -60,13 +62,20 @@ def test_pshp_soundex_first(self): self.assertEqual(self.pa.encode('Knabe'), 'N100') self.assertEqual(self.pa.encode('Phil'), 'F400') self.assertEqual(self.pa.encode('Wieland'), 'V400') - self.assertEqual(self.pa.encode('Wayne', german=True), 'V500') - self.assertEqual(self.pa.encode('Christopher', max_length=-1), 'K5') - self.assertEqual( - self.pa.encode('Asdaananndsjsjasd', max_length=-1), 'A23553223' - ) + self.assertEqual(self.pa_german.encode('Wayne'), 'V500') + self.assertEqual(self.pa_unl.encode('Christopher'), 'K5') + 
self.assertEqual(self.pa_unl.encode('Asdaananndsjsjasd'), 'A23553223') self.assertEqual(self.pa.encode('Asdaananndsjsjasd'), 'A235') + # encode_alpha + self.assertEqual(self.pa.encode_alpha('JAMES'), 'JN') + self.assertEqual(self.pa.encode_alpha('JOHN'), 'JN') + self.assertEqual(self.pa.encode_alpha('PAT'), 'PT') + self.assertEqual(self.pa.encode_alpha('PETER'), 'PT') + self.assertEqual(self.pa.encode_alpha('Knabe'), 'NP') + self.assertEqual(self.pa.encode_alpha('Phil'), 'FL') + self.assertEqual(self.pa.encode_alpha('Wieland'), 'VL') + # Test wrapper self.assertEqual(pshp_soundex_first('Giles'), 'J400') diff --git a/tests/phonetic/test_phonetic_pshp_soundex_last.py b/tests/phonetic/test_phonetic_pshp_soundex_last.py index 09b236bae..267e93c00 100644 --- a/tests/phonetic/test_phonetic_pshp_soundex_last.py +++ b/tests/phonetic/test_phonetic_pshp_soundex_last.py @@ -40,6 +40,8 @@ class PSHPSoundexLastTestCases(unittest.TestCase): """ pa = PSHPSoundexLast() + pa_german = PSHPSoundexLast(german=True) + pa_unl = PSHPSoundexLast(max_length=-1) def test_pshp_soundex_last(self): """Test abydos.phonetic.PSHPSoundexLast.""" @@ -63,21 +65,29 @@ def test_pshp_soundex_last(self): self.assertEqual(self.pa.encode('Knight'), 'N230') self.assertEqual(self.pa.encode('Phillip'), 'F410') self.assertEqual(self.pa.encode('Wein'), 'V500') - self.assertEqual(self.pa.encode('Wagner', german=True), 'V255') + self.assertEqual(self.pa_german.encode('Wagner'), 'V255') self.assertEqual(self.pa.encode('Pence'), 'P500') self.assertEqual(self.pa.encode('Less'), 'L000') self.assertEqual(self.pa.encode('Simpson'), 'S525') self.assertEqual(self.pa.encode('Samson'), 'S250') self.assertEqual(self.pa.encode('Lang'), 'L500') self.assertEqual(self.pa.encode('Hagan'), 'H500') - self.assertEqual(self.pa.encode('Cartes', german=True), 'K500') - self.assertEqual(self.pa.encode('Kats', german=True), 'K000') - self.assertEqual(self.pa.encode('Schultze', german=True), 'S400') - self.assertEqual(self.pa.encode('Alze', german=True), 'A400') - self.assertEqual(self.pa.encode('Galz', german=True), 'G400') - self.assertEqual(self.pa.encode('Alte', german=True), 'A400') - self.assertEqual(self.pa.encode('Alte', max_length=-1), 'A43') - self.assertEqual(self.pa.encode('Altemaier', max_length=-1), 'A4355') + self.assertEqual(self.pa_german.encode('Cartes'), 'K500') + self.assertEqual(self.pa_german.encode('Kats'), 'K000') + self.assertEqual(self.pa_german.encode('Schultze'), 'S400') + self.assertEqual(self.pa_german.encode('Alze'), 'A400') + self.assertEqual(self.pa_german.encode('Galz'), 'G400') + self.assertEqual(self.pa_german.encode('Alte'), 'A400') + self.assertEqual(self.pa_unl.encode('Alte'), 'A43') + self.assertEqual(self.pa_unl.encode('Altemaier'), 'A4355') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Simpson'), 'SNKN') + self.assertEqual(self.pa.encode_alpha('Samson'), 'SKN') + self.assertEqual(self.pa.encode_alpha('Lang'), 'LN') + self.assertEqual(self.pa.encode_alpha('Hagan'), 'HN') + self.assertEqual(self.pa_german.encode_alpha('Cartes'), 'KN') + self.assertEqual(self.pa_german.encode_alpha('Kats'), 'K') # Test wrapper self.assertEqual(pshp_soundex_last('Smith'), 'S530') diff --git a/tests/phonetic/test_phonetic_refined_soundex.py b/tests/phonetic/test_phonetic_refined_soundex.py index d644dadf8..8a6e6e5d8 100644 --- a/tests/phonetic/test_phonetic_refined_soundex.py +++ b/tests/phonetic/test_phonetic_refined_soundex.py @@ -44,294 +44,207 @@ class RefinedSoundexTestCases(unittest.TestCase): def 
test_refined_soundex(self): """Test abydos.phonetic.RefinedSoundex.""" # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html - self.assertEqual(self.pa.encode('Braz'), 'B195') - self.assertEqual(self.pa.encode('Broz'), 'B195') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caron'), 'C398') - self.assertEqual(self.pa.encode('Carren'), 'C398') - self.assertEqual(self.pa.encode('Charon'), 'C398') - self.assertEqual(self.pa.encode('Corain'), 'C398') - self.assertEqual(self.pa.encode('Coram'), 'C398') - self.assertEqual(self.pa.encode('Corran'), 'C398') - self.assertEqual(self.pa.encode('Corrin'), 'C398') - self.assertEqual(self.pa.encode('Corwin'), 'C398') - self.assertEqual(self.pa.encode('Curran'), 'C398') - self.assertEqual(self.pa.encode('Curreen'), 'C398') - self.assertEqual(self.pa.encode('Currin'), 'C398') - self.assertEqual(self.pa.encode('Currom'), 'C398') - self.assertEqual(self.pa.encode('Currum'), 'C398') - self.assertEqual(self.pa.encode('Curwen'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') - self.assertEqual(self.pa.encode('Caren'), 'C398') + self.assertEqual(self.pa.encode('Braz'), 'B95') + self.assertEqual(self.pa.encode('Broz'), 'B95') + self.assertEqual(self.pa.encode('Caren'), 'C98') + self.assertEqual(self.pa.encode('Caron'), 'C98') + self.assertEqual(self.pa.encode('Carren'), 'C98') + self.assertEqual(self.pa.encode('Charon'), 'C98') + self.assertEqual(self.pa.encode('Corain'), 'C98') + self.assertEqual(self.pa.encode('Coram'), 'C98') + self.assertEqual(self.pa.encode('Corran'), 'C98') + self.assertEqual(self.pa.encode('Corrin'), 'C98') + self.assertEqual(self.pa.encode('Corwin'), 'C98') + self.assertEqual(self.pa.encode('Curran'), 'C98') + self.assertEqual(self.pa.encode('Curreen'), 'C98') + self.assertEqual(self.pa.encode('Currin'), 'C98') + self.assertEqual(self.pa.encode('Currom'), 'C98') + self.assertEqual(self.pa.encode('Currum'), 'C98') + self.assertEqual(self.pa.encode('Curwen'), 'C98') self.assertEqual(self.pa.encode('Hairs'), 'H93') self.assertEqual(self.pa.encode('Hark'), 'H93') self.assertEqual(self.pa.encode('Hars'), 'H93') self.assertEqual(self.pa.encode('Hayers'), 'H93') self.assertEqual(self.pa.encode('Heers'), 'H93') self.assertEqual(self.pa.encode('Hiers'), 'H93') - self.assertEqual(self.pa.encode('Lambard'), 'L78196') - self.assertEqual(self.pa.encode('Lambart'), 'L78196') - self.assertEqual(self.pa.encode('Lambert'), 'L78196') - self.assertEqual(self.pa.encode('Lambird'), 'L78196') - self.assertEqual(self.pa.encode('Lampaert'), 'L78196') - self.assertEqual(self.pa.encode('Lampard'), 'L78196') - self.assertEqual(self.pa.encode('Lampart'), 'L78196') - self.assertEqual(self.pa.encode('Lamperd'), 'L78196') - self.assertEqual(self.pa.encode('Lampert'), 'L78196') - self.assertEqual(self.pa.encode('Lamport'), 'L78196') - self.assertEqual(self.pa.encode('Limbert'), 'L78196') - self.assertEqual(self.pa.encode('Lombard'), 'L78196') - self.assertEqual(self.pa.encode('Nolton'), 'N8768') - self.assertEqual(self.pa.encode('Noulton'), 'N8768') + self.assertEqual(self.pa.encode('Lambard'), 'L8196') + self.assertEqual(self.pa.encode('Lambart'), 'L8196') + self.assertEqual(self.pa.encode('Lambert'), 'L8196') + 
self.assertEqual(self.pa.encode('Lambird'), 'L8196') + self.assertEqual(self.pa.encode('Lampaert'), 'L8196') + self.assertEqual(self.pa.encode('Lampard'), 'L8196') + self.assertEqual(self.pa.encode('Lampart'), 'L8196') + self.assertEqual(self.pa.encode('Lamperd'), 'L8196') + self.assertEqual(self.pa.encode('Lampert'), 'L8196') + self.assertEqual(self.pa.encode('Lamport'), 'L8196') + self.assertEqual(self.pa.encode('Limbert'), 'L8196') + self.assertEqual(self.pa.encode('Lombard'), 'L8196') + self.assertEqual(self.pa.encode('Nolton'), 'N768') + self.assertEqual(self.pa.encode('Noulton'), 'N768') # http://trimc-nlp.blogspot.com/2015/03/the-soundex-algorithm.html - self.assertEqual(self.pa.encode('Craig'), 'C394') - self.assertEqual(self.pa.encode('Crag'), 'C394') - self.assertEqual(self.pa.encode('Crejg'), 'C394') - self.assertEqual(self.pa.encode('Creig'), 'C394') - self.assertEqual(self.pa.encode('Craigg'), 'C394') - self.assertEqual(self.pa.encode('Craug'), 'C394') - self.assertEqual(self.pa.encode('Craiggg'), 'C394') - self.assertEqual(self.pa.encode('Creg'), 'C394') - self.assertEqual(self.pa.encode('Cregg'), 'C394') - self.assertEqual(self.pa.encode('Creag'), 'C394') - self.assertEqual(self.pa.encode('Greg'), 'G494') - self.assertEqual(self.pa.encode('Gregg'), 'G494') - self.assertEqual(self.pa.encode('Graig'), 'G494') - self.assertEqual(self.pa.encode('Greig'), 'G494') - self.assertEqual(self.pa.encode('Greggg'), 'G494') - self.assertEqual(self.pa.encode('Groeg'), 'G494') - self.assertEqual(self.pa.encode('Graj'), 'G494') - self.assertEqual(self.pa.encode('Grej'), 'G494') - self.assertEqual(self.pa.encode('Grreg'), 'G494') - self.assertEqual(self.pa.encode('Greag'), 'G494') - self.assertEqual(self.pa.encode('Grig'), 'G494') - self.assertEqual(self.pa.encode('Kregg'), 'K394') - self.assertEqual(self.pa.encode('Kraig'), 'K394') - self.assertEqual(self.pa.encode('Krag'), 'K394') - self.assertEqual(self.pa.encode('Kreig'), 'K394') - self.assertEqual(self.pa.encode('Krug'), 'K394') - self.assertEqual(self.pa.encode('Kreg'), 'K394') - self.assertEqual(self.pa.encode('Krieg'), 'K394') - self.assertEqual(self.pa.encode('Krijg'), 'K394') + self.assertEqual(self.pa.encode('Craig'), 'C94') + self.assertEqual(self.pa.encode('Crag'), 'C94') + self.assertEqual(self.pa.encode('Crejg'), 'C94') + self.assertEqual(self.pa.encode('Creig'), 'C94') + self.assertEqual(self.pa.encode('Craigg'), 'C94') + self.assertEqual(self.pa.encode('Craug'), 'C94') + self.assertEqual(self.pa.encode('Craiggg'), 'C94') + self.assertEqual(self.pa.encode('Creg'), 'C94') + self.assertEqual(self.pa.encode('Cregg'), 'C94') + self.assertEqual(self.pa.encode('Creag'), 'C94') + self.assertEqual(self.pa.encode('Greg'), 'G94') + self.assertEqual(self.pa.encode('Gregg'), 'G94') + self.assertEqual(self.pa.encode('Graig'), 'G94') + self.assertEqual(self.pa.encode('Greig'), 'G94') + self.assertEqual(self.pa.encode('Greggg'), 'G94') + self.assertEqual(self.pa.encode('Groeg'), 'G94') + self.assertEqual(self.pa.encode('Graj'), 'G94') + self.assertEqual(self.pa.encode('Grej'), 'G94') + self.assertEqual(self.pa.encode('Grreg'), 'G94') + self.assertEqual(self.pa.encode('Greag'), 'G94') + self.assertEqual(self.pa.encode('Grig'), 'G94') + self.assertEqual(self.pa.encode('Kregg'), 'K94') + self.assertEqual(self.pa.encode('Kraig'), 'K94') + self.assertEqual(self.pa.encode('Krag'), 'K94') + self.assertEqual(self.pa.encode('Kreig'), 'K94') + self.assertEqual(self.pa.encode('Krug'), 'K94') + self.assertEqual(self.pa.encode('Kreg'), 'K94') + 
self.assertEqual(self.pa.encode('Krieg'), 'K94') + self.assertEqual(self.pa.encode('Krijg'), 'K94') # Apache Commons test cases # http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/RefinedSoundexTest.java?view=markup - self.assertEqual(self.pa.encode('testing'), 'T63684') - self.assertEqual(self.pa.encode('TESTING'), 'T63684') - self.assertEqual(self.pa.encode('The'), 'T6') - self.assertEqual(self.pa.encode('quick'), 'Q53') - self.assertEqual(self.pa.encode('brown'), 'B198') - self.assertEqual(self.pa.encode('fox'), 'F25') - self.assertEqual(self.pa.encode('jumped'), 'J4816') + self.assertEqual(self.pa.encode('testing'), 'T3684') + self.assertEqual(self.pa.encode('TESTING'), 'T3684') + self.assertEqual(self.pa.encode('The'), 'T') + self.assertEqual(self.pa.encode('quick'), 'Q3') + self.assertEqual(self.pa.encode('brown'), 'B98') + self.assertEqual(self.pa.encode('fox'), 'F5') + self.assertEqual(self.pa.encode('jumped'), 'J816') self.assertEqual(self.pa.encode('over'), 'O29') - self.assertEqual(self.pa.encode('the'), 'T6') - self.assertEqual(self.pa.encode('lazy'), 'L75') - self.assertEqual(self.pa.encode('dogs'), 'D643') + self.assertEqual(self.pa.encode('the'), 'T') + self.assertEqual(self.pa.encode('lazy'), 'L5') + self.assertEqual(self.pa.encode('dogs'), 'D43') # Test with retain_vowels=True # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html - self.assertEqual(self.pa.encode('Braz', retain_vowels=True), 'B1905') - self.assertEqual(self.pa.encode('Broz', retain_vowels=True), 'B1905') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caron', retain_vowels=True), 'C30908') - self.assertEqual( - self.pa.encode('Carren', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Charon', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Corain', retain_vowels=True), 'C30908' - ) - self.assertEqual(self.pa.encode('Coram', retain_vowels=True), 'C30908') - self.assertEqual( - self.pa.encode('Corran', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Corrin', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Corwin', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Curran', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Curreen', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Currin', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Currom', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Currum', retain_vowels=True), 'C30908' - ) - self.assertEqual( - self.pa.encode('Curwen', retain_vowels=True), 'C30908' - ) - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Caren', retain_vowels=True), 'C30908') - self.assertEqual(self.pa.encode('Hairs', retain_vowels=True), 'H093') - self.assertEqual(self.pa.encode('Hark', retain_vowels=True), 'H093') - self.assertEqual(self.pa.encode('Hars', retain_vowels=True), 'H093') - self.assertEqual(self.pa.encode('Hayers', 
retain_vowels=True), 'H093') - self.assertEqual(self.pa.encode('Heers', retain_vowels=True), 'H093') - self.assertEqual(self.pa.encode('Hiers', retain_vowels=True), 'H093') - self.assertEqual( - self.pa.encode('Lambard', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lambart', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lambert', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lambird', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lampaert', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lampard', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lampart', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lamperd', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lampert', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lamport', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Limbert', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Lombard', retain_vowels=True), 'L7081096' - ) - self.assertEqual( - self.pa.encode('Nolton', retain_vowels=True), 'N807608' - ) - self.assertEqual( - self.pa.encode('Noulton', retain_vowels=True), 'N807608' - ) + pa_vowels = RefinedSoundex(retain_vowels=True) + self.assertEqual(pa_vowels.encode('Braz'), 'B905') + self.assertEqual(pa_vowels.encode('Broz'), 'B905') + self.assertEqual(pa_vowels.encode('Caren'), 'C0908') + self.assertEqual(pa_vowels.encode('Caron'), 'C0908') + self.assertEqual(pa_vowels.encode('Carren'), 'C0908') + self.assertEqual(pa_vowels.encode('Charon'), 'C0908') + self.assertEqual(pa_vowels.encode('Corain'), 'C0908') + self.assertEqual(pa_vowels.encode('Coram'), 'C0908') + self.assertEqual(pa_vowels.encode('Corran'), 'C0908') + self.assertEqual(pa_vowels.encode('Corrin'), 'C0908') + self.assertEqual(pa_vowels.encode('Corwin'), 'C0908') + self.assertEqual(pa_vowels.encode('Curran'), 'C0908') + self.assertEqual(pa_vowels.encode('Curreen'), 'C0908') + self.assertEqual(pa_vowels.encode('Currin'), 'C0908') + self.assertEqual(pa_vowels.encode('Currom'), 'C0908') + self.assertEqual(pa_vowels.encode('Currum'), 'C0908') + self.assertEqual(pa_vowels.encode('Curwen'), 'C0908') + self.assertEqual(pa_vowels.encode('Hairs'), 'H093') + self.assertEqual(pa_vowels.encode('Hark'), 'H093') + self.assertEqual(pa_vowels.encode('Hars'), 'H093') + self.assertEqual(pa_vowels.encode('Hayers'), 'H093') + self.assertEqual(pa_vowels.encode('Heers'), 'H093') + self.assertEqual(pa_vowels.encode('Hiers'), 'H093') + self.assertEqual(pa_vowels.encode('Lambard'), 'L081096') + self.assertEqual(pa_vowels.encode('Lambart'), 'L081096') + self.assertEqual(pa_vowels.encode('Lambert'), 'L081096') + self.assertEqual(pa_vowels.encode('Lambird'), 'L081096') + self.assertEqual(pa_vowels.encode('Lampaert'), 'L081096') + self.assertEqual(pa_vowels.encode('Lampard'), 'L081096') + self.assertEqual(pa_vowels.encode('Lampart'), 'L081096') + self.assertEqual(pa_vowels.encode('Lamperd'), 'L081096') + self.assertEqual(pa_vowels.encode('Lampert'), 'L081096') + self.assertEqual(pa_vowels.encode('Lamport'), 'L081096') + self.assertEqual(pa_vowels.encode('Limbert'), 'L081096') + self.assertEqual(pa_vowels.encode('Lombard'), 'L081096') + self.assertEqual(pa_vowels.encode('Nolton'), 'N07608') + self.assertEqual(pa_vowels.encode('Noulton'), 'N07608') # http://trimc-nlp.blogspot.com/2015/03/the-soundex-algorithm.html 
- self.assertEqual(self.pa.encode('Craig', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Crag', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Crejg', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Creig', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Craigg', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Craug', retain_vowels=True), 'C3904') - self.assertEqual( - self.pa.encode('Craiggg', retain_vowels=True), 'C3904' - ) - self.assertEqual(self.pa.encode('Creg', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Cregg', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Creag', retain_vowels=True), 'C3904') - self.assertEqual(self.pa.encode('Greg', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Gregg', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Graig', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Greig', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Greggg', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Groeg', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Graj', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Grej', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Grreg', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Greag', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Grig', retain_vowels=True), 'G4904') - self.assertEqual(self.pa.encode('Kregg', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Kraig', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Krag', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Kreig', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Krug', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Kreg', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Krieg', retain_vowels=True), 'K3904') - self.assertEqual(self.pa.encode('Krijg', retain_vowels=True), 'K3904') + self.assertEqual(pa_vowels.encode('Craig'), 'C904') + self.assertEqual(pa_vowels.encode('Crag'), 'C904') + self.assertEqual(pa_vowels.encode('Crejg'), 'C904') + self.assertEqual(pa_vowels.encode('Creig'), 'C904') + self.assertEqual(pa_vowels.encode('Craigg'), 'C904') + self.assertEqual(pa_vowels.encode('Craug'), 'C904') + self.assertEqual(pa_vowels.encode('Craiggg'), 'C904') + self.assertEqual(pa_vowels.encode('Creg'), 'C904') + self.assertEqual(pa_vowels.encode('Cregg'), 'C904') + self.assertEqual(pa_vowels.encode('Creag'), 'C904') + self.assertEqual(pa_vowels.encode('Greg'), 'G904') + self.assertEqual(pa_vowels.encode('Gregg'), 'G904') + self.assertEqual(pa_vowels.encode('Graig'), 'G904') + self.assertEqual(pa_vowels.encode('Greig'), 'G904') + self.assertEqual(pa_vowels.encode('Greggg'), 'G904') + self.assertEqual(pa_vowels.encode('Groeg'), 'G904') + self.assertEqual(pa_vowels.encode('Graj'), 'G904') + self.assertEqual(pa_vowels.encode('Grej'), 'G904') + self.assertEqual(pa_vowels.encode('Grreg'), 'G904') + self.assertEqual(pa_vowels.encode('Greag'), 'G904') + self.assertEqual(pa_vowels.encode('Grig'), 'G904') + self.assertEqual(pa_vowels.encode('Kregg'), 'K904') + self.assertEqual(pa_vowels.encode('Kraig'), 'K904') + self.assertEqual(pa_vowels.encode('Krag'), 'K904') + self.assertEqual(pa_vowels.encode('Kreig'), 'K904') + self.assertEqual(pa_vowels.encode('Krug'), 'K904') + 
self.assertEqual(pa_vowels.encode('Kreg'), 'K904') + self.assertEqual(pa_vowels.encode('Krieg'), 'K904') + self.assertEqual(pa_vowels.encode('Krijg'), 'K904') # Apache Commons test cases # http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/RefinedSoundexTest.java?view=markup - self.assertEqual( - self.pa.encode('testing', retain_vowels=True), 'T6036084' - ) - self.assertEqual( - self.pa.encode('TESTING', retain_vowels=True), 'T6036084' - ) - self.assertEqual(self.pa.encode('The', retain_vowels=True), 'T60') - self.assertEqual(self.pa.encode('quick', retain_vowels=True), 'Q503') - self.assertEqual(self.pa.encode('brown', retain_vowels=True), 'B1908') - self.assertEqual(self.pa.encode('fox', retain_vowels=True), 'F205') - self.assertEqual( - self.pa.encode('jumped', retain_vowels=True), 'J408106' - ) - self.assertEqual(self.pa.encode('over', retain_vowels=True), 'O0209') - self.assertEqual(self.pa.encode('the', retain_vowels=True), 'T60') - self.assertEqual(self.pa.encode('lazy', retain_vowels=True), 'L7050') - self.assertEqual(self.pa.encode('dogs', retain_vowels=True), 'D6043') + self.assertEqual(pa_vowels.encode('testing'), 'T036084') + self.assertEqual(pa_vowels.encode('TESTING'), 'T036084') + self.assertEqual(pa_vowels.encode('The'), 'T0') + self.assertEqual(pa_vowels.encode('quick'), 'Q03') + self.assertEqual(pa_vowels.encode('brown'), 'B908') + self.assertEqual(pa_vowels.encode('fox'), 'F05') + self.assertEqual(pa_vowels.encode('jumped'), 'J08106') + self.assertEqual(pa_vowels.encode('over'), 'O209') + self.assertEqual(pa_vowels.encode('the'), 'T0') + self.assertEqual(pa_vowels.encode('lazy'), 'L050') + self.assertEqual(pa_vowels.encode('dogs'), 'D043') # length tests - self.assertEqual( - self.pa.encode('testing', max_length=4, zero_pad=True), 'T636' - ) - self.assertEqual( - self.pa.encode('TESTING', max_length=4, zero_pad=True), 'T636' - ) - self.assertEqual( - self.pa.encode('The', max_length=4, zero_pad=True), 'T600' - ) - self.assertEqual( - self.pa.encode('quick', max_length=4, zero_pad=True), 'Q530' - ) - self.assertEqual( - self.pa.encode('brown', max_length=4, zero_pad=True), 'B198' - ) - self.assertEqual( - self.pa.encode('fox', max_length=4, zero_pad=True), 'F250' - ) - self.assertEqual( - self.pa.encode('jumped', max_length=4, zero_pad=True), 'J481' - ) - self.assertEqual( - self.pa.encode('over', max_length=4, zero_pad=True), 'O290' - ) - self.assertEqual( - self.pa.encode('the', max_length=4, zero_pad=True), 'T600' - ) - self.assertEqual( - self.pa.encode('lazy', max_length=4, zero_pad=True), 'L750' - ) - self.assertEqual( - self.pa.encode('dogs', max_length=4, zero_pad=True), 'D643' - ) - self.assertEqual(self.pa.encode('The', max_length=4), 'T6') - self.assertEqual(self.pa.encode('quick', max_length=4), 'Q53') - self.assertEqual(self.pa.encode('brown', max_length=4), 'B198') - self.assertEqual(self.pa.encode('fox', max_length=4), 'F25') - self.assertEqual(self.pa.encode('jumped', max_length=4), 'J481') - self.assertEqual(self.pa.encode('over', max_length=4), 'O29') - self.assertEqual(self.pa.encode('the', max_length=4), 'T6') - self.assertEqual(self.pa.encode('lazy', max_length=4), 'L75') - self.assertEqual(self.pa.encode('dogs', max_length=4), 'D643') + pa_40 = RefinedSoundex(max_length=4, zero_pad=True) + self.assertEqual(pa_40.encode('testing'), 'T368') + self.assertEqual(pa_40.encode('TESTING'), 'T368') + self.assertEqual(pa_40.encode('The'), 'T000') + self.assertEqual(pa_40.encode('quick'), 'Q300') + 
self.assertEqual(pa_40.encode('brown'), 'B980') + self.assertEqual(pa_40.encode('fox'), 'F500') + self.assertEqual(pa_40.encode('jumped'), 'J816') + self.assertEqual(pa_40.encode('over'), 'O290') + self.assertEqual(pa_40.encode('the'), 'T000') + self.assertEqual(pa_40.encode('lazy'), 'L500') + self.assertEqual(pa_40.encode('dogs'), 'D430') + pa_4 = RefinedSoundex(max_length=4) + self.assertEqual(pa_4.encode('The'), 'T') + self.assertEqual(pa_4.encode('quick'), 'Q3') + self.assertEqual(pa_4.encode('brown'), 'B98') + self.assertEqual(pa_4.encode('fox'), 'F5') + self.assertEqual(pa_4.encode('jumped'), 'J816') + self.assertEqual(pa_4.encode('over'), 'O29') + self.assertEqual(pa_4.encode('the'), 'T') + self.assertEqual(pa_4.encode('lazy'), 'L5') + self.assertEqual(pa_4.encode('dogs'), 'D43') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Broz'), 'BRZ') + self.assertEqual(self.pa.encode_alpha('Caren'), 'CRN') + self.assertEqual(self.pa.encode_alpha('Hairs'), 'HRK') + self.assertEqual(self.pa.encode_alpha('Lamperd'), 'LNPRT') # Test wrapper - self.assertEqual(refined_soundex('Braz'), 'B195') + self.assertEqual(refined_soundex('Braz'), 'B95') if __name__ == '__main__': diff --git a/tests/phonetic/test_phonetic_roger_root.py b/tests/phonetic/test_phonetic_roger_root.py index 404616450..0b0301cc9 100644 --- a/tests/phonetic/test_phonetic_roger_root.py +++ b/tests/phonetic/test_phonetic_roger_root.py @@ -125,26 +125,33 @@ def test_roger_root(self): self.assertEqual(self.pa.encode('Lüdenscheidt'), '05126') # no zero_pad - self.assertEqual(self.pa.encode('BROWNER', zero_pad=False), '09424') - self.assertEqual(self.pa.encode('STANLEY', zero_pad=False), '00125') - self.assertEqual(self.pa.encode('CHALMAN', zero_pad=False), '06532') - self.assertEqual(self.pa.encode('CHING', zero_pad=False), '0627') - self.assertEqual(self.pa.encode('ANDERSON', zero_pad=False), '12140') - self.assertEqual(self.pa.encode('OVERSTREET, zero_pad=False'), '18401') - self.assertEqual(self.pa.encode('HECKEL', zero_pad=False), '275') - self.assertEqual(self.pa.encode('WYSZYNSKI', zero_pad=False), '40207') - self.assertEqual(self.pa.encode('WHITTED', zero_pad=False), '411') - self.assertEqual(self.pa.encode('ONGOQO', zero_pad=False), '1277') - self.assertEqual(self.pa.encode('JOHNSON', zero_pad=False), '3202') - self.assertEqual(self.pa.encode('WILLIAMS', zero_pad=False), '4530') - self.assertEqual(self.pa.encode('SMITH', zero_pad=False), '0031') - self.assertEqual(self.pa.encode('JONES', zero_pad=False), '320') - self.assertEqual(self.pa.encode('BROWN', zero_pad=False), '0942') - self.assertEqual(self.pa.encode('DAVIS', zero_pad=False), '0180') - self.assertEqual(self.pa.encode('JACKSON', zero_pad=False), '3702') - self.assertEqual(self.pa.encode('WILSON', zero_pad=False), '4502') - self.assertEqual(self.pa.encode('LEE', zero_pad=False), '05') - self.assertEqual(self.pa.encode('THOMAS', zero_pad=False), '0130') + nzp = RogerRoot(zero_pad=False) + self.assertEqual(nzp.encode('BROWNER'), '09424') + self.assertEqual(nzp.encode('STANLEY'), '00125') + self.assertEqual(nzp.encode('CHALMAN'), '06532') + self.assertEqual(nzp.encode('CHING'), '0627') + self.assertEqual(nzp.encode('ANDERSON'), '12140') + self.assertEqual(nzp.encode('OVERSTREET'), '18401') + self.assertEqual(nzp.encode('HECKEL'), '275') + self.assertEqual(nzp.encode('WYSZYNSKI'), '40207') + self.assertEqual(nzp.encode('WHITTED'), '411') + self.assertEqual(nzp.encode('ONGOQO'), '1277') + self.assertEqual(nzp.encode('JOHNSON'), '3202') + 
self.assertEqual(nzp.encode('WILLIAMS'), '4530') + self.assertEqual(nzp.encode('SMITH'), '0031') + self.assertEqual(nzp.encode('JONES'), '320') + self.assertEqual(nzp.encode('BROWN'), '0942') + self.assertEqual(nzp.encode('DAVIS'), '0180') + self.assertEqual(nzp.encode('JACKSON'), '3702') + self.assertEqual(nzp.encode('WILSON'), '4502') + self.assertEqual(nzp.encode('LEE'), '05') + self.assertEqual(nzp.encode('THOMAS'), '0130') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('BROWNER'), 'PRNR') + self.assertEqual(self.pa.encode_alpha('STANLEY'), 'STNL') + self.assertEqual(self.pa.encode_alpha('CHALMAN'), 'JLMN') + self.assertEqual(self.pa.encode_alpha('CHING'), 'JNK') # Test wrapper self.assertEqual(roger_root('BROWNER'), '09424') diff --git a/tests/phonetic/test_phonetic_russell.py b/tests/phonetic/test_phonetic_russell_index.py similarity index 100% rename from tests/phonetic/test_phonetic_russell.py rename to tests/phonetic/test_phonetic_russell_index.py diff --git a/tests/phonetic/test_phonetic_sv.py b/tests/phonetic/test_phonetic_sfinxbis.py similarity index 87% rename from tests/phonetic/test_phonetic_sv.py rename to tests/phonetic/test_phonetic_sfinxbis.py index 125107f5b..308372db4 100644 --- a/tests/phonetic/test_phonetic_sv.py +++ b/tests/phonetic/test_phonetic_sfinxbis.py @@ -40,6 +40,7 @@ class SfinxBisTestCases(unittest.TestCase): """ pa = SfinxBis() + pa4 = SfinxBis(4) def test_sfinxbis(self): """Test abydos.phonetic.SfinxBis.""" @@ -82,8 +83,8 @@ def test_sfinxbis(self): self.assertEqual(self.pa.encode('von Ahn'), ('$5',)) self.assertEqual(self.pa.encode('von Dem Knesebeck'), ('K5812',)) self.assertEqual(self.pa.encode('von Der Burg'), ('B62',)) - self.assertEqual(self.pa.encode('D\'Angelo'), ('D524',)) - self.assertEqual(self.pa.encode('O\'Conner'), ('$256',)) + self.assertEqual(self.pa.encode("D'Angelo"), ('D524',)) + self.assertEqual(self.pa.encode("O'Conner"), ('$256',)) self.assertEqual(self.pa.encode('Los'), ('L8',)) self.assertEqual(self.pa.encode('Mac'), ('M2',)) self.assertEqual(self.pa.encode('Till'), ('T4',)) @@ -252,27 +253,27 @@ def test_sfinxbis(self): # a few max_length tests self.assertEqual( - self.pa.encode('Kiviniemi Birgersson', 3), ('#75', 'B62') + SfinxBis(3).encode('Kiviniemi Birgersson'), ('#75', 'B62') ) - self.assertEqual(self.pa.encode('Eichorn', 4), ('$265',)) - self.assertEqual(self.pa.encode('Friedrich', 4), ('F636',)) - self.assertEqual(self.pa.encode('Grantcharova', 4), ('G653',)) - self.assertEqual(self.pa.encode('Ilichev', 4), ('$427',)) - self.assertEqual(self.pa.encode('Ivankovic', 4), ('$752',)) - self.assertEqual(self.pa.encode('Ivangurich', 4), ('$752',)) - self.assertEqual(self.pa.encode('Kinch', 4), ('#52',)) - self.assertEqual(self.pa.encode('Kirchmann', 4), ('#625',)) - self.assertEqual(self.pa.encode('Machado', 4), ('M23',)) - self.assertEqual(self.pa.encode('Reich', 4), ('R2',)) - self.assertEqual(self.pa.encode('Roche', 4), ('R2',)) - self.assertEqual(self.pa.encode('Rubaszkin', 4), ('R182',)) - self.assertEqual(self.pa.encode('Rubaschkin', 4), ('R182',)) - self.assertEqual(self.pa.encode('Sanchez', 4), ('S528',)) - self.assertEqual(self.pa.encode('Walukiewicz', 4), ('V427',)) - self.assertEqual(self.pa.encode('Valukievitch', 4), ('V427',)) - self.assertEqual(self.pa.encode('K', 4), ('K',)) - self.assertEqual(self.pa.encode('2010', 4), ('',)) - self.assertEqual(self.pa.encode('cese', 4), ('S8',)) + self.assertEqual(self.pa4.encode('Eichorn'), ('$265',)) + self.assertEqual(self.pa4.encode('Friedrich'), ('F636',)) + 
self.assertEqual(self.pa4.encode('Grantcharova'), ('G653',)) + self.assertEqual(self.pa4.encode('Ilichev'), ('$427',)) + self.assertEqual(self.pa4.encode('Ivankovic'), ('$752',)) + self.assertEqual(self.pa4.encode('Ivangurich'), ('$752',)) + self.assertEqual(self.pa4.encode('Kinch'), ('#52',)) + self.assertEqual(self.pa4.encode('Kirchmann'), ('#625',)) + self.assertEqual(self.pa4.encode('Machado'), ('M23',)) + self.assertEqual(self.pa4.encode('Reich'), ('R2',)) + self.assertEqual(self.pa4.encode('Roche'), ('R2',)) + self.assertEqual(self.pa4.encode('Rubaszkin'), ('R182',)) + self.assertEqual(self.pa4.encode('Rubaschkin'), ('R182',)) + self.assertEqual(self.pa4.encode('Sanchez'), ('S528',)) + self.assertEqual(self.pa4.encode('Walukiewicz'), ('V427',)) + self.assertEqual(self.pa4.encode('Valukievitch'), ('V427',)) + self.assertEqual(self.pa4.encode('K'), ('K',)) + self.assertEqual(self.pa4.encode('2010'), ('',)) + self.assertEqual(self.pa4.encode('cese'), ('S8',)) # etc. (for code coverage) self.assertEqual(self.pa.encode('chans'), ('#58',)) @@ -284,8 +285,22 @@ def test_sfinxbis(self): self.assertEqual(self.pa.encode('skil'), ('#4',)) # max_length bounds tests - self.assertEqual(self.pa.encode('Niall', max_length=-1), ('N4',)) - self.assertEqual(self.pa.encode('Niall', max_length=0), ('N4',)) + self.assertEqual(SfinxBis(max_length=-1).encode('Niall'), ('N4',)) + self.assertEqual(SfinxBis(max_length=0).encode('Niall'), ('N4',)) + + # encode_alpha + self.assertEqual( + self.pa.encode_alpha('Stael von Holstein'), ('STL', 'HLSTKN') + ) + self.assertEqual( + self.pa.encode_alpha('de Oliveira e Silva'), ('$LFKR', 'SLF') + ) + self.assertEqual( + self.pa.encode_alpha('de Alfaro y Gómez'), ('$LFR', 'GNS') + ) + self.assertEqual( + self.pa.encode_alpha('Arjaliès-de la Lande'), ('$RKLS', 'LNT') + ) # Test wrapper self.assertEqual(sfinxbis('af Sandeberg'), ('S53162',)) diff --git a/tests/phonetic/test_phonetic_sound_d.py b/tests/phonetic/test_phonetic_sound_d.py index f3216479f..77719de48 100644 --- a/tests/phonetic/test_phonetic_sound_d.py +++ b/tests/phonetic/test_phonetic_sound_d.py @@ -45,7 +45,7 @@ def test_sound_d(self): """Test abydos.phonetic.SoundD.""" # Base cases self.assertEqual(self.pa.encode(''), '0000') - self.assertEqual(self.pa.encode('', max_length=6), '000000') + self.assertEqual(SoundD(max_length=6).encode(''), '000000') self.assertEqual(self.pa.encode('knight'), '5300') self.assertEqual(self.pa.encode('accept'), '2130') @@ -59,9 +59,15 @@ def test_sound_d(self): self.assertEqual(self.pa.encode('rough'), '6000') self.assertEqual(self.pa.encode('x-ray'), '2600') self.assertEqual( - self.pa.encode('acetylcholine', max_length=-1), '234245' + SoundD(max_length=-1).encode('acetylcholine'), '234245' ) - self.assertEqual(self.pa.encode('rough', max_length=-1), '6') + self.assertEqual(SoundD(max_length=-1).encode('rough'), '6') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('pox'), 'PK') + self.assertEqual(self.pa.encode_alpha('anywhere'), 'NR') + self.assertEqual(self.pa.encode_alpha('adenosine'), 'TNKN') + self.assertEqual(self.pa.encode_alpha('judge'), 'KK') # Test wrapper self.assertEqual(sound_d('knight'), '5300') diff --git a/tests/phonetic/test_phonetic_soundex.py b/tests/phonetic/test_phonetic_soundex.py index 0956e221e..132009c9e 100644 --- a/tests/phonetic/test_phonetic_soundex.py +++ b/tests/phonetic/test_phonetic_soundex.py @@ -96,65 +96,65 @@ def test_soundex(self): self.assertEqual(self.pa.encode('Jackson-Jackson'), 'J252') # max_length tests - 
self.assertEqual(self.pa.encode('Lincoln', 10), 'L524500000') - self.assertEqual(self.pa.encode('Lincoln', 5), 'L5245') - self.assertEqual(self.pa.encode('Christopher', 6), 'C62316') + self.assertEqual(Soundex(10).encode('Lincoln'), 'L524500000') + self.assertEqual(Soundex(5).encode('Lincoln'), 'L5245') + self.assertEqual(Soundex(6).encode('Christopher'), 'C62316') # max_length bounds tests self.assertEqual( - self.pa.encode('Niall', max_length=-1), + Soundex(max_length=-1).encode('Niall'), 'N4000000000000000000000000000000000000000000000000' + '00000000000000', ) - self.assertEqual(self.pa.encode('Niall', max_length=0), 'N400') + self.assertEqual(Soundex(max_length=0).encode('Niall'), 'N400') # reverse tests - self.assertEqual(self.pa.encode('Rubin', reverse=True), 'N160') - self.assertEqual(self.pa.encode('Llyod', reverse=True), 'D400') - self.assertEqual(self.pa.encode('Lincoln', reverse=True), 'N425') - self.assertEqual(self.pa.encode('Knuth', reverse=True), 'H352') + self.assertEqual(Soundex(reverse=True).encode('Rubin'), 'N160') + self.assertEqual(Soundex(reverse=True).encode('Llyod'), 'D400') + self.assertEqual(Soundex(reverse=True).encode('Lincoln'), 'N425') + self.assertEqual(Soundex(reverse=True).encode('Knuth'), 'H352') # zero_pad tests self.assertEqual( - self.pa.encode('Niall', max_length=-1, zero_pad=False), 'N4' + Soundex(max_length=-1, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=False), 'N4' + Soundex(max_length=0, zero_pad=False).encode('Niall'), 'N4' ) self.assertEqual( - self.pa.encode('Niall', max_length=0, zero_pad=True), 'N400' + Soundex(max_length=0, zero_pad=True).encode('Niall'), 'N400' ) - self.assertEqual(self.pa.encode('', max_length=4, zero_pad=False), '0') + self.assertEqual(Soundex(max_length=4, zero_pad=False).encode(''), '0') self.assertEqual( - self.pa.encode('', max_length=4, zero_pad=True), '0000' + Soundex(max_length=4, zero_pad=True).encode(''), '0000' ) + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Euler'), 'ELR') + self.assertEqual(self.pa.encode_alpha('Gauss'), 'GK') + self.assertEqual(self.pa.encode_alpha('Hilbert'), 'HLPR') + self.assertEqual(self.pa.encode_alpha('Knuth'), 'KNT') + # Test wrapper self.assertEqual(soundex('Euler'), 'E460') def test_soundex_special(self): """Test abydos.phonetic.Soundex (special 1880-1910 variant method).""" - self.assertEqual(self.pa.encode('Ashcroft', var='special'), 'A226') - self.assertEqual(self.pa.encode('Asicroft', var='special'), 'A226') - self.assertEqual(self.pa.encode('AsWcroft', var='special'), 'A226') - self.assertEqual(self.pa.encode('Rupert', var='special'), 'R163') - self.assertEqual(self.pa.encode('Rubin', var='special'), 'R150') + pa_special = Soundex(var='special') + self.assertEqual(pa_special.encode('Ashcroft'), 'A226') + self.assertEqual(pa_special.encode('Asicroft'), 'A226') + self.assertEqual(pa_special.encode('AsWcroft'), 'A226') + self.assertEqual(pa_special.encode('Rupert'), 'R163') + self.assertEqual(pa_special.encode('Rubin'), 'R150') def test_soundex_census(self): """Test abydos.phonetic.Soundex(Census variant method).""" - self.assertEqual( - self.pa.encode('Vandeusen', var='Census'), ('V532', 'D250') - ) - self.assertEqual( - self.pa.encode('van Deusen', var='Census'), ('V532', 'D250') - ) - self.assertEqual(self.pa.encode('McDonald', var='Census'), 'M235') - self.assertEqual( - self.pa.encode('la Cruz', var='Census'), ('L262', 'C620') - ) - self.assertEqual( - self.pa.encode('vanDamme', var='Census'), ('V535', 
'D500') - ) + pa_census = Soundex(var='Census') + self.assertEqual(pa_census.encode('Vandeusen'), ('V532', 'D250')) + self.assertEqual(pa_census.encode('van Deusen'), ('V532', 'D250')) + self.assertEqual(pa_census.encode('McDonald'), 'M235') + self.assertEqual(pa_census.encode('la Cruz'), ('L262', 'C620')) + self.assertEqual(pa_census.encode('vanDamme'), ('V535', 'D500')) if __name__ == '__main__': diff --git a/tests/phonetic/test_phonetic_soundex_br.py b/tests/phonetic/test_phonetic_soundex_br.py index 30710cbce..228128dce 100644 --- a/tests/phonetic/test_phonetic_soundex_br.py +++ b/tests/phonetic/test_phonetic_soundex_br.py @@ -65,7 +65,13 @@ def test_soundex_br(self): self.assertEqual(self.pa.encode('Cici'), 'S200') self.assertEqual(self.pa.encode('Gerard'), 'J663') self.assertEqual(self.pa.encode('Yglesias'), 'I242') - self.assertEqual(self.pa.encode('Cici', zero_pad=False), 'S2') + self.assertEqual(SoundexBR(zero_pad=False).encode('Cici'), 'S2') + + # encode_alpha + self.assertEqual(self.pa.encode_alpha('Aecio Neves'), 'AKNP') + self.assertEqual(self.pa.encode_alpha('HILBERT'), 'ILPR') + self.assertEqual(self.pa.encode_alpha('Heilbronn'), 'ELPR') + self.assertEqual(self.pa.encode_alpha('Gauss'), 'GK') # Test wrapper self.assertEqual(soundex_br('Ana Karolina Kuhnen'), 'A526') diff --git a/tests/phonetic/test_phonetic_spanish_metaphone.py b/tests/phonetic/test_phonetic_spanish_metaphone.py index f52ee0dae..1d0522290 100644 --- a/tests/phonetic/test_phonetic_spanish_metaphone.py +++ b/tests/phonetic/test_phonetic_spanish_metaphone.py @@ -40,6 +40,7 @@ class SpanishMetaphoneTestCases(unittest.TestCase): """ pa = SpanishMetaphone() + pa_mod = SpanishMetaphone(modified=True) def test_spanish_metaphone(self): """Test abydos.phonetic.SpanishMetaphone.""" @@ -119,47 +120,47 @@ def test_spanish_metaphone(self): # tests from file:///home/chrislit/Downloads/ICTRS_2016_12.pdf # including of the modified version of the algorithm self.assertEqual(self.pa.encode('Caricia'), 'KRZ') - self.assertEqual(self.pa.encode('Caricia', modified=True), 'KRZ') + self.assertEqual(self.pa_mod.encode('Caricia'), 'KRZ') self.assertEqual(self.pa.encode('Llaves'), 'YVS') - self.assertEqual(self.pa.encode('Llaves', modified=True), 'YVZ') + self.assertEqual(self.pa_mod.encode('Llaves'), 'YVZ') self.assertEqual(self.pa.encode('Paella'), 'PY') - self.assertEqual(self.pa.encode('Paella', modified=True), 'PY') + self.assertEqual(self.pa_mod.encode('Paella'), 'PY') self.assertEqual(self.pa.encode('Cerilla'), 'ZRY') - self.assertEqual(self.pa.encode('Cerilla', modified=True), 'ZRY') + self.assertEqual(self.pa_mod.encode('Cerilla'), 'ZRY') self.assertEqual(self.pa.encode('Empeorar'), 'EMPRR') - self.assertEqual(self.pa.encode('Empeorar', modified=True), 'ENPRR') + self.assertEqual(self.pa_mod.encode('Empeorar'), 'ENPRR') self.assertEqual(self.pa.encode('Embotellar'), 'EMVTYR') - self.assertEqual(self.pa.encode('Embotellar', modified=True), 'ENVTYR') + self.assertEqual(self.pa_mod.encode('Embotellar'), 'ENVTYR') self.assertEqual(self.pa.encode('Hoy'), 'OY') - self.assertEqual(self.pa.encode('Hoy', modified=True), 'OY') + self.assertEqual(self.pa_mod.encode('Hoy'), 'OY') self.assertEqual(self.pa.encode('Xochimilco'), 'XXMLK') - self.assertEqual(self.pa.encode('Xochimilco', modified=True), 'XXMLK') + self.assertEqual(self.pa_mod.encode('Xochimilco'), 'XXMLK') self.assertEqual(self.pa.encode('Psiquiatra'), 'PSKTR') - self.assertEqual(self.pa.encode('Psiquiatra', modified=True), 'ZKTR') + 
self.assertEqual(self.pa_mod.encode('Psiquiatra'), 'ZKTR') self.assertEqual(self.pa.encode('siquiatra'), 'SKTR') - self.assertEqual(self.pa.encode('siquiatra', modified=True), 'ZKTR') + self.assertEqual(self.pa_mod.encode('siquiatra'), 'ZKTR') self.assertEqual(self.pa.encode('Obscuro'), 'OVSKR') - self.assertEqual(self.pa.encode('Obscuro', modified=True), 'OZKR') + self.assertEqual(self.pa_mod.encode('Obscuro'), 'OZKR') self.assertEqual(self.pa.encode('Oscuro'), 'OSKR') - self.assertEqual(self.pa.encode('Oscuro', modified=True), 'OZKR') + self.assertEqual(self.pa_mod.encode('Oscuro'), 'OZKR') self.assertEqual(self.pa.encode('Combate'), 'KMVT') - self.assertEqual(self.pa.encode('Combate', modified=True), 'KNVT') + self.assertEqual(self.pa_mod.encode('Combate'), 'KNVT') self.assertEqual(self.pa.encode('Convate'), 'KNVT') - self.assertEqual(self.pa.encode('Convate', modified=True), 'KNVT') + self.assertEqual(self.pa_mod.encode('Convate'), 'KNVT') self.assertEqual(self.pa.encode('Conbate'), 'KNVT') - self.assertEqual(self.pa.encode('Conbate', modified=True), 'KNVT') + self.assertEqual(self.pa_mod.encode('Conbate'), 'KNVT') self.assertEqual(self.pa.encode('Comportar'), 'KMPRTR') - self.assertEqual(self.pa.encode('Comportar', modified=True), 'KNPRTR') + self.assertEqual(self.pa_mod.encode('Comportar'), 'KNPRTR') self.assertEqual(self.pa.encode('Conportar'), 'KNPRTR') - self.assertEqual(self.pa.encode('Conportar', modified=True), 'KNPRTR') + self.assertEqual(self.pa_mod.encode('Conportar'), 'KNPRTR') self.assertEqual(self.pa.encode('Zapato'), 'ZPT') - self.assertEqual(self.pa.encode('Zapato', modified=True), 'ZPT') + self.assertEqual(self.pa_mod.encode('Zapato'), 'ZPT') self.assertEqual(self.pa.encode('Sapato'), 'SPT') - self.assertEqual(self.pa.encode('Sapato', modified=True), 'ZPT') + self.assertEqual(self.pa_mod.encode('Sapato'), 'ZPT') self.assertEqual(self.pa.encode('Escalera'), 'ESKLR') - self.assertEqual(self.pa.encode('Escalera', modified=True), 'EZKLR') + self.assertEqual(self.pa_mod.encode('Escalera'), 'EZKLR') self.assertEqual(self.pa.encode('scalera'), 'ESKLR') - self.assertEqual(self.pa.encode('scalera', modified=True), 'EZKLR') + self.assertEqual(self.pa_mod.encode('scalera'), 'EZKLR') # terms from algorithm/source self.assertEqual(self.pa.encode('acción'), 'AXN') diff --git a/tests/phonetic/test_phonetic_spfc.py b/tests/phonetic/test_phonetic_spfc.py index a94f08e87..f42f7be73 100644 --- a/tests/phonetic/test_phonetic_spfc.py +++ b/tests/phonetic/test_phonetic_spfc.py @@ -67,15 +67,21 @@ def test_spfc(self): self.assertRaises(AttributeError, self.pa.encode, 5) # etc. (for code coverage) - self.assertEqual(self.pa.encode('James Goldstein'), '78795') - self.assertEqual(self.pa.encode('James Hansen'), '58760') - self.assertEqual(self.pa.encode('James Hester'), '59700') + self.assertEqual(self.pa.encode('James Goldstein'), '77795') + self.assertEqual(self.pa.encode('James Hansen'), '57760') + self.assertEqual(self.pa.encode('James Hester'), '57700') self.assertEqual(self.pa.encode('James Bardot'), '31745') - self.assertEqual(self.pa.encode('James Windsor'), '29765') + self.assertEqual(self.pa.encode('James Windsor'), '27765') self.assertEqual(self.pa.encode('James Wenders'), '27760') self.assertEqual(self.pa.encode('James Ventor'), '17760') self.assertEqual(self.pa.encode('þ þ'), '00') + # encode_alpha + self.assertEqual(self.pa.encode_alpha('J. Kuhns'), 'CSGMS') + self.assertEqual(self.pa.encode_alpha('G. Altshuler'), 'ARGEG') + self.assertEqual(self.pa.encode_alpha('T. 
Vines'), 'CSGMS')
+        self.assertEqual(self.pa.encode_alpha('James Ventor'), 'CZGMS')
+
         # Test wrapper
         self.assertEqual(spfc('G ALTSHULER'), '35797')
diff --git a/tests/phonetic/test_phonetic_waahlin.py b/tests/phonetic/test_phonetic_waahlin.py
new file mode 100644
index 000000000..8105838bf
--- /dev/null
+++ b/tests/phonetic/test_phonetic_waahlin.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see .
+
+"""abydos.tests.phonetic.test_phonetic_waahlin.
+
+This module contains unit tests for abydos.phonetic.Waahlin
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+import unittest
+
+from abydos.phonetic import Soundex, Waahlin
+
+
+class WaahlinTestCases(unittest.TestCase):
+    """Test Wåhlin functions.
+
+    test cases for abydos.phonetic.Waahlin
+    """
+
+    pa = Waahlin()
+    pa_sdx = Waahlin(Soundex())
+
+    def test_waahlin(self):
+        """Test abydos.phonetic.Waahlin."""
+        self.assertEqual(self.pa.encode(''), '')
+
+        self.assertEqual(self.pa.encode('kjol'), '+OL')
+        self.assertEqual(self.pa.encode('stråken'), 'STRÅ+EN')
+        self.assertEqual(self.pa.encode('skytten'), '*YTTEN')
+        self.assertEqual(self.pa.encode('ljuden'), 'JUDEN')
+        self.assertEqual(self.pa.encode('högre'), 'HÖGRE')
+        self.assertEqual(self.pa.encode('först'), 'FÖRST')
+        self.assertEqual(self.pa.encode('hval'), 'VAL')
+        self.assertEqual(self.pa.encode('hrothgar'), 'ROTHGAR')
+        self.assertEqual(self.pa.encode('denna'), 'DENNA')
+        self.assertEqual(self.pa.encode('djur'), 'JUR')
+        self.assertEqual(self.pa.encode('hjärta'), 'JERTA')
+        self.assertEqual(self.pa.encode('STIEN'), '*EN')
+        self.assertEqual(self.pa.encode('SKJERN'), '*ERN')
+        self.assertEqual(self.pa.encode('HIELPA'), 'JELPA')
+        self.assertEqual(self.pa.encode('CEILA'), 'SEILA')
+        self.assertEqual(self.pa.encode('GELD'), 'JELD')
+        self.assertEqual(self.pa.encode('IERN'), 'JERN')
+
+        # encode_alpha
+        self.assertEqual(self.pa.encode_alpha('kjol'), 'ÇOL')
+        self.assertEqual(self.pa.encode_alpha('stråken'), 'STRÅÇEN')
+        self.assertEqual(self.pa.encode_alpha('skytten'), 'ŠYTTEN')
+        self.assertEqual(self.pa.encode_alpha('ljuden'), 'JUDEN')
+
+    def test_waahlin_soundex(self):
+        """Test abydos.phonetic.Waahlin with Soundex."""
+        self.assertEqual(self.pa_sdx.encode(''), '')
+
+        self.assertEqual(self.pa_sdx.encode('kjol'), '+O400')
+        self.assertEqual(self.pa_sdx.encode('stråken'), 'ST625')
+        self.assertEqual(self.pa_sdx.encode('skytten'), '*Y350')
+        self.assertEqual(self.pa_sdx.encode('ljuden'), 'JU350')
+        self.assertEqual(self.pa_sdx.encode('högre'), 'HO260')
+        self.assertEqual(self.pa_sdx.encode('först'), 'FO623')
+        self.assertEqual(self.pa_sdx.encode('hval'), 'VA400')
+        self.assertEqual(self.pa_sdx.encode('hrothgar'), 'RO326')
+        self.assertEqual(self.pa_sdx.encode('denna'), 'DE500')
+        self.assertEqual(self.pa_sdx.encode('djur'), 'JU600')
+
self.assertEqual(self.pa_sdx.encode('hjärta'), 'JA630') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/regression b/tests/regression index 9d3e812a8..37ef0ed3d 160000 --- a/tests/regression +++ b/tests/regression @@ -1 +1 @@ -Subproject commit 9d3e812a8f24d00a2afa0899b20d80b5c2f737ab +Subproject commit 37ef0ed3d96721b5bae2137ef871f684886d4f05 diff --git a/tests/stats/test_stats_confusion_table.py b/tests/stats/test_stats_confusion_table.py index f7cb6f211..7222e3351 100644 --- a/tests/stats/test_stats_confusion_table.py +++ b/tests/stats/test_stats_confusion_table.py @@ -144,6 +144,13 @@ def test_str(self): self.assertIsInstance(str(SCALE_TABLE), str) self.assertEqual(str(SCALE_TABLE), 'tp:1, tn:2, fp:3, fn:4') + def test_repr(self): + """Test abydos.stats.ConfusionTable._repr_.""" + self.assertIsInstance(repr(SCALE_TABLE), str) + self.assertEqual( + repr(SCALE_TABLE), 'ConfusionTable(tp=1, tn=2, fp=3, fn=4)' + ) + class PopulationTestCases(unittest.TestCase): """Test abydos.stats.ConfusionTable population methods.""" @@ -164,21 +171,21 @@ def test_error_pop(self): self.assertEqual(CATSNDOGS_TABLE.error_pop(), 5) self.assertEqual(WORKED_EG_TABLE.error_pop(), 190) - def test_test_pos_pop(self): - """Test abydos.stats.ConfusionTable.test_pos_pop.""" - self.assertEqual(UNIT_TABLE.test_pos_pop(), 2) - self.assertEqual(NULL_TABLE.test_pos_pop(), 0) - self.assertEqual(SCALE_TABLE.test_pos_pop(), 4) - self.assertEqual(CATSNDOGS_TABLE.test_pos_pop(), 7) - self.assertEqual(WORKED_EG_TABLE.test_pos_pop(), 200) - - def test_test_neg_pop(self): - """Test abydos.stats.ConfusionTable.test_neg_pop.""" - self.assertEqual(UNIT_TABLE.test_neg_pop(), 2) - self.assertEqual(NULL_TABLE.test_neg_pop(), 0) - self.assertEqual(SCALE_TABLE.test_neg_pop(), 6) - self.assertEqual(CATSNDOGS_TABLE.test_neg_pop(), 20) - self.assertEqual(WORKED_EG_TABLE.test_neg_pop(), 1830) + def test_pred_pos_pop(self): + """Test abydos.stats.ConfusionTable.pred_pos_pop.""" + self.assertEqual(UNIT_TABLE.pred_pos_pop(), 2) + self.assertEqual(NULL_TABLE.pred_pos_pop(), 0) + self.assertEqual(SCALE_TABLE.pred_pos_pop(), 4) + self.assertEqual(CATSNDOGS_TABLE.pred_pos_pop(), 7) + self.assertEqual(WORKED_EG_TABLE.pred_pos_pop(), 200) + + def test_pred_neg_pop(self): + """Test abydos.stats.ConfusionTable.pred_neg_pop.""" + self.assertEqual(UNIT_TABLE.pred_neg_pop(), 2) + self.assertEqual(NULL_TABLE.pred_neg_pop(), 0) + self.assertEqual(SCALE_TABLE.pred_neg_pop(), 6) + self.assertEqual(CATSNDOGS_TABLE.pred_neg_pop(), 20) + self.assertEqual(WORKED_EG_TABLE.pred_neg_pop(), 1830) def test_cond_pos_pop(self): """Test abydos.stats.ConfusionTable.cond_pos_pop.""" @@ -244,6 +251,14 @@ def test_specificity(self): self.assertAlmostEqual(CATSNDOGS_TABLE.specificity(), 17 / 19) self.assertAlmostEqual(WORKED_EG_TABLE.specificity(), 0.91) + def test_fnr(self): + """Test abydos.stats.ConfusionTable.fnr.""" + self.assertEqual(UNIT_TABLE.fnr(), 0.5) + self.assertTrue(isnan(NULL_TABLE.fnr())) + self.assertAlmostEqual(SCALE_TABLE.fnr(), 0.8) + self.assertAlmostEqual(CATSNDOGS_TABLE.fnr(), 3 / 8) + self.assertAlmostEqual(WORKED_EG_TABLE.fnr(), 1 / 3) + def test_npv(self): """Test abydos.stats.ConfusionTable.npv.""" self.assertEqual(UNIT_TABLE.npv(), 0.5) @@ -252,6 +267,16 @@ def test_npv(self): self.assertAlmostEqual(CATSNDOGS_TABLE.npv(), 17 / 20) self.assertAlmostEqual(WORKED_EG_TABLE.npv(), 182 / 183) + def test_false_omission_rate(self): + """Test abydos.stats.ConfusionTable.false_omission_rate.""" + 
self.assertEqual(UNIT_TABLE.false_omission_rate(), 0.5) + self.assertTrue(isnan(NULL_TABLE.false_omission_rate())) + self.assertAlmostEqual(SCALE_TABLE.false_omission_rate(), 2 / 3) + self.assertAlmostEqual(CATSNDOGS_TABLE.false_omission_rate(), 3 / 20) + self.assertAlmostEqual( + WORKED_EG_TABLE.false_omission_rate(), 10 / 1830 + ) + def test_fallout(self): """Test abydos.stats.ConfusionTable.fallout.""" self.assertEqual(UNIT_TABLE.fallout(), 0.5) @@ -260,6 +285,38 @@ def test_fallout(self): self.assertAlmostEqual(CATSNDOGS_TABLE.fallout(), 2 / 19) self.assertAlmostEqual(WORKED_EG_TABLE.fallout(), 0.09) + def test_pos_likelihood_ratio(self): + """Test abydos.stats.ConfusionTable.pos_likelihood_ratio.""" + self.assertEqual(UNIT_TABLE.pos_likelihood_ratio(), 1.0) + self.assertTrue(isnan(NULL_TABLE.pos_likelihood_ratio())) + self.assertAlmostEqual(SCALE_TABLE.pos_likelihood_ratio(), 1 / 3) + self.assertAlmostEqual(CATSNDOGS_TABLE.pos_likelihood_ratio(), 5.9375) + self.assertAlmostEqual( + WORKED_EG_TABLE.pos_likelihood_ratio(), 7.407407407407409 + ) + + def test_neg_likelihood_ratio(self): + """Test abydos.stats.ConfusionTable.neg_likelihood_ratio.""" + self.assertEqual(UNIT_TABLE.neg_likelihood_ratio(), 1.0) + self.assertTrue(isnan(NULL_TABLE.neg_likelihood_ratio())) + self.assertAlmostEqual(SCALE_TABLE.neg_likelihood_ratio(), 2.0) + self.assertAlmostEqual( + CATSNDOGS_TABLE.neg_likelihood_ratio(), 0.41911764705882354 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.neg_likelihood_ratio(), 0.36630036630036633 + ) + + def test_diagnostic_odds_ratio(self): + """Test abydos.stats.ConfusionTable.diagnostic_odds_ratio.""" + self.assertEqual(UNIT_TABLE.diagnostic_odds_ratio(), 1.0) + self.assertTrue(isnan(NULL_TABLE.diagnostic_odds_ratio())) + self.assertAlmostEqual(SCALE_TABLE.diagnostic_odds_ratio(), 1 / 6) + self.assertAlmostEqual(CATSNDOGS_TABLE.diagnostic_odds_ratio(), 85 / 6) + self.assertAlmostEqual( + WORKED_EG_TABLE.diagnostic_odds_ratio(), 20.22222222222222 + ) + def test_fdr(self): """Test abydos.stats.ConfusionTable.fdr.""" self.assertEqual(UNIT_TABLE.fdr(), 0.5) @@ -301,6 +358,22 @@ def test_balanced_accuracy(self): self.assertAlmostEqual(CATSNDOGS_TABLE.balanced_accuracy(), 231 / 304) self.assertAlmostEqual(WORKED_EG_TABLE.balanced_accuracy(), 473 / 600) + def test_error_rate(self): + """Test abydos.stats.ConfusionTable.error_rate.""" + self.assertEqual(UNIT_TABLE.error_rate(), 0.5) + self.assertTrue(isnan(NULL_TABLE.error_rate())) + self.assertAlmostEqual(SCALE_TABLE.error_rate(), 0.7) + self.assertAlmostEqual(CATSNDOGS_TABLE.error_rate(), 5 / 27) + self.assertAlmostEqual(WORKED_EG_TABLE.error_rate(), 190 / 2030) + + def test_prevalence(self): + """Test abydos.stats.ConfusionTable.prevalence.""" + self.assertEqual(UNIT_TABLE.prevalence(), 0.5) + self.assertTrue(isnan(NULL_TABLE.prevalence())) + self.assertAlmostEqual(SCALE_TABLE.prevalence(), 0.5) + self.assertAlmostEqual(CATSNDOGS_TABLE.prevalence(), 8 / 27) + self.assertAlmostEqual(WORKED_EG_TABLE.prevalence(), 30 / 2030) + def test_informedness(self): """Test abydos.stats.ConfusionTable.informedness.""" self.assertEqual(UNIT_TABLE.informedness(), 0) @@ -610,6 +683,14 @@ def test_f_measure(self): self.assertAlmostEqual(CATSNDOGS_TABLE.f_measure(), 2 / 3) self.assertAlmostEqual(WORKED_EG_TABLE.f_measure(), 4 / 23) + def test_jaccard(self): + """Test abydos.stats.ConfusionTable.jaccard.""" + self.assertEqual(UNIT_TABLE.jaccard(), 1 / 3) + self.assertTrue(isnan(NULL_TABLE.jaccard())) + self.assertAlmostEqual(SCALE_TABLE.jaccard(), 1 
/ 8) + self.assertAlmostEqual(CATSNDOGS_TABLE.jaccard(), 0.5) + self.assertAlmostEqual(WORKED_EG_TABLE.jaccard(), 20 / 210) + def test_g_measure(self): """Test abydos.stats.ConfusionTable.g_measure.""" self.assertEqual(UNIT_TABLE.g_measure(), 0.5) @@ -622,6 +703,14 @@ def test_g_measure(self): WORKED_EG_TABLE.g_measure(), 0.25819888974716115 ) + def test_d_measure(self): + """Test abydos.stats.ConfusionTable.d_measure.""" + self.assertAlmostEqual(UNIT_TABLE.d_measure(), 2 / 3) + self.assertTrue(isnan(NULL_TABLE.d_measure())) + self.assertAlmostEqual(SCALE_TABLE.d_measure(), 7 / 8) + self.assertAlmostEqual(CATSNDOGS_TABLE.d_measure(), 0.5) + self.assertAlmostEqual(WORKED_EG_TABLE.d_measure(), 0.9047619047619048) + def test_mcc(self): """Test abydos.stats.ConfusionTable.mcc.""" self.assertEqual(UNIT_TABLE.mcc(), 0) @@ -664,6 +753,114 @@ def _quick_kappa(acc, racc): _quick_kappa((184 / 203), (((2000 * 1830) + 6000) / 2030 ** 2)), ) + def test_phi_coefficient(self): + """Test abydos.stats.ConfusionTable.phi_coefficient.""" + self.assertEqual(UNIT_TABLE.phi_coefficient(), 0.0) + self.assertTrue(isnan(NULL_TABLE.phi_coefficient())) + self.assertAlmostEqual( + SCALE_TABLE.phi_coefficient(), -0.408248290463863 + ) + self.assertAlmostEqual( + CATSNDOGS_TABLE.phi_coefficient(), 0.5415533908932432 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.phi_coefficient(), 0.23348550853492078 + ) + + def test_joint_entropy(self): + """Test abydos.stats.ConfusionTable.joint_entropy.""" + self.assertEqual(UNIT_TABLE.joint_entropy(), 1.3862943611198906) + self.assertTrue(isnan(NULL_TABLE.joint_entropy())) + self.assertAlmostEqual(SCALE_TABLE.joint_entropy(), 1.2798542258336676) + self.assertAlmostEqual( + CATSNDOGS_TABLE.joint_entropy(), 1.040505471995055 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.joint_entropy(), 0.38442665366628237 + ) + + def test_actual_entropy(self): + """Test abydos.stats.ConfusionTable.actual_entropy.""" + self.assertEqual(UNIT_TABLE.actual_entropy(), 0.6931471805599453) + self.assertTrue(isnan(NULL_TABLE.actual_entropy())) + self.assertAlmostEqual( + SCALE_TABLE.actual_entropy(), 0.6931471805599456 + ) + self.assertAlmostEqual( + CATSNDOGS_TABLE.actual_entropy(), 0.6076934238709568 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.actual_entropy(), 0.07695321955601564 + ) + + def test_predicted_entropy(self): + """Test abydos.stats.ConfusionTable.predicted_entropy.""" + self.assertEqual(UNIT_TABLE.predicted_entropy(), 0.6931471805599453) + self.assertTrue(isnan(NULL_TABLE.predicted_entropy())) + self.assertAlmostEqual( + SCALE_TABLE.predicted_entropy(), 0.6730116670092565 + ) + self.assertAlmostEqual( + CATSNDOGS_TABLE.predicted_entropy(), 0.5722806988018472 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.predicted_entropy(), 0.3218236566720343 + ) + + def test_mutual_information(self): + """Test abydos.stats.ConfusionTable.mutual_information.""" + self.assertEqual(UNIT_TABLE.mutual_information(), 0.0) + self.assertTrue(isnan(NULL_TABLE.mutual_information())) + self.assertAlmostEqual( + SCALE_TABLE.mutual_information(), 0.08630462173553424 + ) + self.assertAlmostEqual( + CATSNDOGS_TABLE.mutual_information(), 0.13946865067774858 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.mutual_information(), 0.014350222561768025 + ) + + def test_proficiency(self): + """Test abydos.stats.ConfusionTable.proficiency.""" + self.assertEqual(UNIT_TABLE.proficiency(), 0.0) + self.assertTrue(isnan(NULL_TABLE.proficiency())) + self.assertAlmostEqual(SCALE_TABLE.proficiency(), 0.12451124978365304) + 
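        # Worked check (illustration only): the entropy, mutual-information,
        # proficiency, igr & dependency expectations in these tests follow
        # from the cell counts of SCALE_TABLE (tp=1, tn=2, fp=3, fn=4) with
        # natural-log Shannon entropy; the formulas are inferred from the
        # expected values themselves, not copied from the implementation.
        #
        #     from math import log
        #
        #     tp, tn, fp, fn = 1.0, 2.0, 3.0, 4.0
        #     n = tp + tn + fp + fn
        #
        #     def h(probs):
        #         return -sum(p * log(p) for p in probs if p)
        #
        #     h_joint = h([tp / n, tn / n, fp / n, fn / n])  # ~1.2798542
        #     h_act = h([(tp + fn) / n, (tn + fp) / n])      # ~0.6931472
        #     h_pred = h([(tp + fp) / n, (tn + fn) / n])     # ~0.6730117
        #     mi = h_act + h_pred - h_joint                  # ~0.0863046
        #
        #     mi / h_act    # proficiency             ~0.1245112
        #     mi / h_pred   # information gain ratio  ~0.1282364
        #     mi / h_joint  # dependency              ~0.0674332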
self.assertAlmostEqual( + CATSNDOGS_TABLE.proficiency(), 0.229504952989856 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.proficiency(), 0.1864798203968872 + ) + + def test_igr(self): + """Test abydos.stats.ConfusionTable.igr.""" + self.assertEqual(UNIT_TABLE.igr(), 0.0) + self.assertTrue(isnan(NULL_TABLE.igr())) + self.assertAlmostEqual(SCALE_TABLE.igr(), 0.12823644219877575) + self.assertAlmostEqual(CATSNDOGS_TABLE.igr(), 0.24370671764703314) + self.assertAlmostEqual(WORKED_EG_TABLE.igr(), 0.044590328474180894) + + def test_dependency(self): + """Test abydos.stats.ConfusionTable.dependency.""" + self.assertEqual(UNIT_TABLE.dependency(), 0.0) + self.assertTrue(isnan(NULL_TABLE.dependency())) + self.assertAlmostEqual(SCALE_TABLE.dependency(), 0.06743316542891234) + self.assertAlmostEqual( + CATSNDOGS_TABLE.dependency(), 0.13403932457013681 + ) + self.assertAlmostEqual( + WORKED_EG_TABLE.dependency(), 0.03732889596730547 + ) + + def test_lift(self): + """Test abydos.stats.ConfusionTable.lift.""" + self.assertEqual(UNIT_TABLE.lift(), 1.0) + self.assertTrue(isnan(NULL_TABLE.lift())) + self.assertAlmostEqual(SCALE_TABLE.lift(), 0.5) + self.assertAlmostEqual(CATSNDOGS_TABLE.lift(), 2.4107142857142856) + self.assertAlmostEqual(WORKED_EG_TABLE.lift(), 6.76666666666666) + if __name__ == '__main__': unittest.main() diff --git a/tests/stats/test_stats_mean.py b/tests/stats/test_stats_mean.py index f86a6b6f0..65026b65a 100644 --- a/tests/stats/test_stats_mean.py +++ b/tests/stats/test_stats_mean.py @@ -101,7 +101,7 @@ def test_means_hmean(self): self.assertEqual(hmean([5, 5, 5, 5, 5]), 5) self.assertEqual(hmean([0]), 0) self.assertEqual(hmean([8]), 8) - self.assertRaises(AttributeError, hmean, ([])) + self.assertRaises(ValueError, hmean, ([])) def test_means_qmean(self): """Test abydos.stats.qmean.""" @@ -122,16 +122,20 @@ def test_means_lmean(self): """Test abydos.stats.lmean.""" self.assertAlmostEqual(lmean(self._one_to_five), 2.6739681320855766) self.assertAlmostEqual(lmean(self._floats), 0.301387278840469) - self.assertRaises(AttributeError, lmean, (1, 1)) - self.assertRaises(AttributeError, lmean, (0.15, 0.15)) + self.assertEqual(lmean([1, 1]), 1.0) + self.assertEqual(lmean([2, 2]), 2.0) + self.assertEqual(lmean([2, 0]), 0.0) + self.assertAlmostEqual(lmean([1, 2]), 1.4426950408889634) + self.assertRaises(ValueError, lmean, (1, 1, 1)) + self.assertRaises(ValueError, lmean, (0.15, 0.15, 1)) def test_means_imean(self): """Test abydos.stats.imean.""" - self.assertRaises(AttributeError, imean, self._ones) - self.assertRaises(AttributeError, imean, self._zeros) - self.assertRaises(AttributeError, imean, self._one_to_five) - self.assertRaises(AttributeError, imean, self._onethreefive) - self.assertRaises(AttributeError, imean, self._floats) + self.assertRaises(ValueError, imean, self._ones) + self.assertRaises(ValueError, imean, self._zeros) + self.assertRaises(ValueError, imean, self._one_to_five) + self.assertRaises(ValueError, imean, self._onethreefive) + self.assertRaises(ValueError, imean, self._floats) self.assertAlmostEqual(imean(self._2ones), 1) self.assertTrue(isnan(imean(self._2zeros))) self.assertAlmostEqual(imean(self._onetwo), 1.4715177646857693) @@ -141,17 +145,18 @@ def test_means_imean(self): def test_means_seiffert_mean(self): """Test abydos.stats.seiffert_mean.""" - self.assertRaises(AttributeError, seiffert_mean, self._ones) - self.assertRaises(AttributeError, seiffert_mean, self._zeros) - self.assertRaises(AttributeError, seiffert_mean, self._one_to_five) - 
self.assertRaises(AttributeError, seiffert_mean, self._onethreefive) - self.assertRaises(AttributeError, seiffert_mean, self._floats) + self.assertRaises(ValueError, seiffert_mean, self._ones) + self.assertRaises(ValueError, seiffert_mean, self._zeros) + self.assertRaises(ValueError, seiffert_mean, self._one_to_five) + self.assertRaises(ValueError, seiffert_mean, self._onethreefive) + self.assertRaises(ValueError, seiffert_mean, self._floats) self.assertAlmostEqual(seiffert_mean(self._onetwo), 1.4712939827611637) self.assertAlmostEqual( seiffert_mean(self._2floats), 0.36782349569029094 ) self.assertEqual(seiffert_mean([1]), 1) self.assertEqual(seiffert_mean([0.05]), 0.05) + self.assertTrue(isnan(seiffert_mean([1, 1]))) def test_means_lehmer_mean(self): """Test abydos.stats.lehmer_mean.""" @@ -189,6 +194,9 @@ def test_means_hoelder_mean(self): hoelder_mean(self._onethreefive), 3.492849839314596 ) self.assertAlmostEqual(hoelder_mean(self._floats), 0.4477722635447623) + self.assertAlmostEqual( + hoelder_mean(self._floats, 0), gmean(self._floats) + ) def test_means_agmean(self): """Test abydos.stats.agmean.""" @@ -197,6 +205,7 @@ def test_means_agmean(self): self.assertAlmostEqual(agmean(self._one_to_five), 2.799103662640505) self.assertAlmostEqual(agmean(self._onethreefive), 2.6764865062631356) self.assertAlmostEqual(agmean(self._floats), 0.32800436242611486) + self.assertTrue(isnan(agmean(self._has_inf))) def test_means_ghmean(self): """Test abydos.stats.ghmean.""" @@ -204,6 +213,7 @@ def test_means_ghmean(self): self.assertAlmostEqual(ghmean(self._one_to_five), 2.3839666656453167) self.assertAlmostEqual(ghmean(self._onethreefive), 2.0740491019412035) self.assertAlmostEqual(ghmean(self._floats), 0.2536468771476393) + self.assertTrue(isnan(ghmean(self._has_inf))) def test_means_aghmean(self): """Test abydos.stats.aghmean.""" diff --git a/tests/stats/test_stats_pairwise.py b/tests/stats/test_stats_pairwise.py index be994b92f..39a55fc06 100644 --- a/tests/stats/test_stats_pairwise.py +++ b/tests/stats/test_stats_pairwise.py @@ -52,7 +52,7 @@ 'Nel', 'Kneale', 'Uí Néill', - 'O\'Neill', + "O'Neill", 'MacNeil', 'MacNele', 'Niall Noígíallach', @@ -70,7 +70,7 @@ 'Nigelli', 'Nel', 'Kneale', - 'O\'Neill', + "O'Neill", 'MacNeil', 'MacNele', ) diff --git a/tests/stemmer/test_stemmer__snowball.py b/tests/stemmer/test_stemmer__snowball.py index 9b88c8987..6cd1377bc 100644 --- a/tests/stemmer/test_stemmer__snowball.py +++ b/tests/stemmer/test_stemmer__snowball.py @@ -86,6 +86,13 @@ def test_sb_r1(self): self.assertEqual(self.stmr._sb_r1('sprinkled'), 5) # noqa: SF01 self.assertEqual(self.stmr._sb_r1('eucharist'), 3) # noqa: SF01 + self.assertEqual( + self.stmr._sb_r1('eucharist', r1_prefixes={'ist'}), 3 # noqa: SF01 + ) + self.assertEqual( + self.stmr._sb_r1('ist', r1_prefixes={'ist'}), 3 # noqa: SF01 + ) + def test_sb_r2(self): """Test abydos.stemmer._Snowball._sb_r2.""" # base case @@ -146,6 +153,9 @@ def test_sb_ends_in_short_syllable(self): self.assertFalse( self.stmr._sb_ends_in_short_syllable('a') # noqa: SF01 ) + self.assertFalse( + self.stmr._sb_ends_in_short_syllable('da') # noqa: SF01 + ) def test_sb_short_word(self): """Test abydos.stemmer._Snowball._sb_short_word.""" diff --git a/tests/stemmer/test_stemmer_porter.py b/tests/stemmer/test_stemmer_porter.py index 449408288..597c7be82 100644 --- a/tests/stemmer/test_stemmer_porter.py +++ b/tests/stemmer/test_stemmer_porter.py @@ -44,6 +44,8 @@ class PorterTestCases(unittest.TestCase): stmr = Porter() stmr._vowels = set('aeiouy') # noqa: SF01 + 
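    # early_english is now a constructor parameter rather than a stem()
    # keyword argument (see the rewritten test cases below), so a separately
    # configured instance is kept for the early-English tests.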
stmr_ee = Porter(early_english=True) + stmr_ee._vowels = set('aeiouy') # noqa: SF01 def test_m_degree(self): """Test abydos.stemmer.Porter._m_degree.""" @@ -202,30 +204,30 @@ def test_porter(self): def test_porter_early_english(self): """Test abydos.stemmer.Porter (early English).""" # base case - self.assertEqual(self.stmr.stem('', early_english=True), '') + self.assertEqual(self.stmr_ee.stem(''), '') # simple cases (no different from regular stemmer) - self.assertEqual(self.stmr.stem('c', early_english=True), 'c') - self.assertEqual(self.stmr.stem('da', early_english=True), 'da') - self.assertEqual(self.stmr.stem('ad', early_english=True), 'ad') - self.assertEqual(self.stmr.stem('sing', early_english=True), 'sing') - self.assertEqual(self.stmr.stem('singing', early_english=True), 'sing') + self.assertEqual(self.stmr_ee.stem('c'), 'c') + self.assertEqual(self.stmr_ee.stem('da'), 'da') + self.assertEqual(self.stmr_ee.stem('ad'), 'ad') + self.assertEqual(self.stmr_ee.stem('sing'), 'sing') + self.assertEqual(self.stmr_ee.stem('singing'), 'sing') # make - self.assertEqual(self.stmr.stem('make', early_english=True), 'make') - self.assertEqual(self.stmr.stem('makes', early_english=True), 'make') - self.assertEqual(self.stmr.stem('maketh', early_english=True), 'make') - self.assertEqual(self.stmr.stem('makest', early_english=True), 'make') + self.assertEqual(self.stmr_ee.stem('make'), 'make') + self.assertEqual(self.stmr_ee.stem('makes'), 'make') + self.assertEqual(self.stmr_ee.stem('maketh'), 'make') + self.assertEqual(self.stmr_ee.stem('makest'), 'make') # say - self.assertEqual(self.stmr.stem('say', early_english=True), 'sai') - self.assertEqual(self.stmr.stem('says', early_english=True), 'sai') - self.assertEqual(self.stmr.stem('sayeth', early_english=True), 'sai') - self.assertEqual(self.stmr.stem('sayest', early_english=True), 'sai') + self.assertEqual(self.stmr_ee.stem('say'), 'sai') + self.assertEqual(self.stmr_ee.stem('says'), 'sai') + self.assertEqual(self.stmr_ee.stem('sayeth'), 'sai') + self.assertEqual(self.stmr_ee.stem('sayest'), 'sai') # missed branch test cases - self.assertEqual(self.stmr.stem('best', early_english=True), 'best') - self.assertEqual(self.stmr.stem('meth', early_english=True), 'meth') + self.assertEqual(self.stmr_ee.stem('best'), 'best') + self.assertEqual(self.stmr_ee.stem('meth'), 'meth') def test_porter_snowball(self): """Test abydos.stemmer.Porter (Snowball testset). 
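# Illustrative sketch (not part of the patch): the hunk above reflects the move
# of early_english from a stem() keyword argument to a constructor parameter;
# the Porter2, SnowballGerman and UEALite hunks below follow the same pattern.
# Expected outputs are taken from the tests above.

from abydos.stemmer import Porter

stmr = Porter()
stmr_ee = Porter(early_english=True)  # also stems archaic -eth/-est forms

stmr_ee.stem('maketh')   # 'make'
stmr_ee.stem('sayest')   # 'sai'
stmr.stem('singing')     # 'sing' (unaffected by the option)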
diff --git a/tests/stemmer/test_stemmer_porter2.py b/tests/stemmer/test_stemmer_porter2.py index 4aca06ff1..f4ff846bc 100644 --- a/tests/stemmer/test_stemmer_porter2.py +++ b/tests/stemmer/test_stemmer_porter2.py @@ -43,6 +43,9 @@ class Porter2TestCases(unittest.TestCase): """ stmr = Porter2() + stmr._vowels = set('aeiouy') # noqa: SF01 + stmr_ee = Porter2(early_english=True) + stmr_ee._vowels = set('aeiouy') # noqa: SF01 def test_porter2(self): """Test abydos.stemmer.Porter2.""" @@ -59,8 +62,8 @@ def test_porter2(self): # missed branch test cases self.assertEqual(self.stmr.stem('capitalism'), 'capit') self.assertEqual(self.stmr.stem('fatalism'), 'fatal') - self.assertEqual(self.stmr.stem('dog\'s'), 'dog') - self.assertEqual(self.stmr.stem('A\'s\''), 'a') + self.assertEqual(self.stmr.stem("dog's"), 'dog') + self.assertEqual(self.stmr.stem("A's'"), 'a') self.assertEqual(self.stmr.stem('agreedly'), 'agre') self.assertEqual(self.stmr.stem('feedly'), 'feed') self.assertEqual(self.stmr.stem('stional'), 'stional') @@ -79,30 +82,30 @@ def test_porter2(self): def test_porter2_early_english(self): """Test abydos.stemmer.Porter2 (early English).""" # base case - self.assertEqual(self.stmr.stem('', early_english=True), '') + self.assertEqual(self.stmr_ee.stem(''), '') # simple cases (no different from regular stemmer) - self.assertEqual(self.stmr.stem('c', early_english=True), 'c') - self.assertEqual(self.stmr.stem('da', early_english=True), 'da') - self.assertEqual(self.stmr.stem('ad', early_english=True), 'ad') - self.assertEqual(self.stmr.stem('sing', early_english=True), 'sing') - self.assertEqual(self.stmr.stem('singing', early_english=True), 'sing') + self.assertEqual(self.stmr_ee.stem('c'), 'c') + self.assertEqual(self.stmr_ee.stem('da'), 'da') + self.assertEqual(self.stmr_ee.stem('ad'), 'ad') + self.assertEqual(self.stmr_ee.stem('sing'), 'sing') + self.assertEqual(self.stmr_ee.stem('singing'), 'sing') # make - self.assertEqual(self.stmr.stem('make', early_english=True), 'make') - self.assertEqual(self.stmr.stem('makes', early_english=True), 'make') - self.assertEqual(self.stmr.stem('maketh', early_english=True), 'make') - self.assertEqual(self.stmr.stem('makest', early_english=True), 'make') + self.assertEqual(self.stmr_ee.stem('make'), 'make') + self.assertEqual(self.stmr_ee.stem('makes'), 'make') + self.assertEqual(self.stmr_ee.stem('maketh'), 'make') + self.assertEqual(self.stmr_ee.stem('makest'), 'make') # say - self.assertEqual(self.stmr.stem('say', early_english=True), 'say') - self.assertEqual(self.stmr.stem('says', early_english=True), 'say') - self.assertEqual(self.stmr.stem('sayeth', early_english=True), 'say') - self.assertEqual(self.stmr.stem('sayest', early_english=True), 'say') + self.assertEqual(self.stmr_ee.stem('say'), 'say') + self.assertEqual(self.stmr_ee.stem('says'), 'say') + self.assertEqual(self.stmr_ee.stem('sayeth'), 'say') + self.assertEqual(self.stmr_ee.stem('sayest'), 'say') # missed branch test cases - self.assertEqual(self.stmr.stem('best', early_english=True), 'best') - self.assertEqual(self.stmr.stem('meth', early_english=True), 'meth') + self.assertEqual(self.stmr_ee.stem('best'), 'best') + self.assertEqual(self.stmr_ee.stem('meth'), 'meth') def test_porter2_snowball(self): """Test abydos.stemmer.Porter2 (Snowball testset). 
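# Side note (illustration only): with early_english enabled, Porter and Porter2
# return different results for 'say'-type words; the values below are the
# expectations from the two test modules above.

from abydos.stemmer import Porter, Porter2

Porter(early_english=True).stem('sayeth')    # 'sai'
Porter2(early_english=True).stem('sayeth')   # 'say'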
diff --git a/tests/stemmer/test_stemmer_snowball_german.py b/tests/stemmer/test_stemmer_snowball_german.py index 5cfdfef25..1ace91ccd 100644 --- a/tests/stemmer/test_stemmer_snowball_german.py +++ b/tests/stemmer/test_stemmer_snowball_german.py @@ -43,6 +43,7 @@ class SnowballGermanTestCases(unittest.TestCase): """ stmr = SnowballGerman() + stmr_av = SnowballGerman(alternate_vowels=True) def test_snowball_german(self): """Test abydos.stemmer.SnowballGerman (Snowball testset). @@ -73,67 +74,47 @@ def test_snowball_german(self): def test_sb_german_snowball_alt(self): """Test abydos.stemmer.SnowballGerman (alternate vowels).""" # base case - self.assertEqual(self.stmr.stem('', alternate_vowels=True), '') + self.assertEqual(self.stmr_av.stem(''), '') # dämmerung,dammer - self.assertEqual( - self.stmr.stem('dämmerung', alternate_vowels=True), 'dammer' - ) - self.assertEqual( - self.stmr.stem('daemmerung', alternate_vowels=True), 'dammer' - ) + self.assertEqual(self.stmr_av.stem('dämmerung'), 'dammer') + self.assertEqual(self.stmr_av.stem('daemmerung'), 'dammer') self.assertEqual(self.stmr.stem('dämmerung'), 'dammer') self.assertEqual(self.stmr.stem('daemmerung'), 'daemmer') # brötchen,brotch - self.assertEqual( - self.stmr.stem('brötchen', alternate_vowels=True), 'brotch' - ) - self.assertEqual( - self.stmr.stem('broetchen', alternate_vowels=True), 'brotch' - ) + self.assertEqual(self.stmr_av.stem('brötchen'), 'brotch') + self.assertEqual(self.stmr_av.stem('broetchen'), 'brotch') self.assertEqual(self.stmr.stem('brötchen'), 'brotch') self.assertEqual(self.stmr.stem('broetchen'), 'broetch') # büro,buro - self.assertEqual(self.stmr.stem('büro', alternate_vowels=True), 'buro') - self.assertEqual( - self.stmr.stem('buero', alternate_vowels=True), 'buro' - ) + self.assertEqual(self.stmr_av.stem('büro'), 'buro') + self.assertEqual(self.stmr_av.stem('buero'), 'buro') self.assertEqual(self.stmr.stem('büro'), 'buro') self.assertEqual(self.stmr.stem('buero'), 'buero') # häufen,hauf - self.assertEqual( - self.stmr.stem('häufen', alternate_vowels=True), 'hauf' - ) - self.assertEqual( - self.stmr.stem('haeufen', alternate_vowels=True), 'hauf' - ) + self.assertEqual(self.stmr_av.stem('häufen'), 'hauf') + self.assertEqual(self.stmr_av.stem('haeufen'), 'hauf') self.assertEqual(self.stmr.stem('häufen'), 'hauf') self.assertEqual(self.stmr.stem('haeufen'), 'haeuf') # quelle,quell - self.assertEqual( - self.stmr.stem('qülle', alternate_vowels=True), 'qull' - ) - self.assertEqual( - self.stmr.stem('quelle', alternate_vowels=True), 'quell' - ) + self.assertEqual(self.stmr_av.stem('qülle'), 'qull') + self.assertEqual(self.stmr_av.stem('quelle'), 'quell') self.assertEqual(self.stmr.stem('qülle'), 'qull') self.assertEqual(self.stmr.stem('quelle'), 'quell') # feuer,feuer - self.assertEqual(self.stmr.stem('feür', alternate_vowels=True), 'feur') - self.assertEqual(self.stmr.stem('feuer', alternate_vowels=True), 'feu') + self.assertEqual(self.stmr_av.stem('feür'), 'feur') + self.assertEqual(self.stmr_av.stem('feuer'), 'feu') self.assertEqual(self.stmr.stem('feür'), 'feur') self.assertEqual(self.stmr.stem('feuer'), 'feu') # über,uber - self.assertEqual(self.stmr.stem('über', alternate_vowels=True), 'uber') - self.assertEqual( - self.stmr.stem('ueber', alternate_vowels=True), 'uber' - ) + self.assertEqual(self.stmr_av.stem('über'), 'uber') + self.assertEqual(self.stmr_av.stem('ueber'), 'uber') self.assertEqual(self.stmr.stem('über'), 'uber') self.assertEqual(self.stmr.stem('ueber'), 'ueb') diff --git 
a/tests/stemmer/test_stemmer_uealite.py b/tests/stemmer/test_stemmer_uealite.py index 64d435329..af3a9f84f 100644 --- a/tests/stemmer/test_stemmer_uealite.py +++ b/tests/stemmer/test_stemmer_uealite.py @@ -42,6 +42,7 @@ class UEALiteTestCases(unittest.TestCase): """ stmr = UEALite() + stmr_adams = UEALite(var='Adams') def test_uealite(self): """Test abydos.stemmer.UEALite.""" @@ -109,70 +110,70 @@ def test_uealite(self): # test cases copied from Ruby port # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb # stem base words to just the base word - self.assertEqual(self.stmr.stem('man', var='Adams'), 'man') - self.assertEqual(self.stmr.stem('happiness', var='Adams'), 'happiness') + self.assertEqual(self.stmr_adams.stem('man'), 'man') + self.assertEqual(self.stmr_adams.stem('happiness'), 'happiness') # stem theses as thesis but not bases as basis - self.assertEqual(self.stmr.stem('theses', var='Adams'), 'thesis') - self.assertNotEqual(self.stmr.stem('bases', var='Adams'), 'basis') + self.assertEqual(self.stmr_adams.stem('theses'), 'thesis') + self.assertNotEqual(self.stmr_adams.stem('bases'), 'basis') # stem preterite words ending in -ed without the -ed - self.assertEqual(self.stmr.stem('ordained', var='Adams'), 'ordain') - self.assertEqual(self.stmr.stem('killed', var='Adams'), 'kill') - self.assertEqual(self.stmr.stem('liked', var='Adams'), 'like') - self.assertEqual(self.stmr.stem('helped', var='Adams'), 'help') - # self.assertEqual(self.stmr.stem('scarred', var='Adams'), 'scar') - self.assertEqual(self.stmr.stem('invited', var='Adams'), 'invite') - self.assertEqual(self.stmr.stem('exited', var='Adams'), 'exit') - self.assertEqual(self.stmr.stem('debited', var='Adams'), 'debit') - self.assertEqual(self.stmr.stem('smited', var='Adams'), 'smite') + self.assertEqual(self.stmr_adams.stem('ordained'), 'ordain') + self.assertEqual(self.stmr_adams.stem('killed'), 'kill') + self.assertEqual(self.stmr_adams.stem('liked'), 'like') + self.assertEqual(self.stmr_adams.stem('helped'), 'help') + # self.assertEqual(self.stmr_adams.stem('scarred'), 'scar') + self.assertEqual(self.stmr_adams.stem('invited'), 'invite') + self.assertEqual(self.stmr_adams.stem('exited'), 'exit') + self.assertEqual(self.stmr_adams.stem('debited'), 'debit') + self.assertEqual(self.stmr_adams.stem('smited'), 'smite') # stem progressive verbs and gerunds without the -ing - self.assertEqual(self.stmr.stem('running', var='Adams'), 'run') - self.assertEqual(self.stmr.stem('settings', var='Adams'), 'set') - self.assertEqual(self.stmr.stem('timing', var='Adams'), 'time') - self.assertEqual(self.stmr.stem('dying', var='Adams'), 'die') - self.assertEqual(self.stmr.stem('harping', var='Adams'), 'harp') - self.assertEqual(self.stmr.stem('charring', var='Adams'), 'char') + self.assertEqual(self.stmr_adams.stem('running'), 'run') + self.assertEqual(self.stmr_adams.stem('settings'), 'set') + self.assertEqual(self.stmr_adams.stem('timing'), 'time') + self.assertEqual(self.stmr_adams.stem('dying'), 'die') + self.assertEqual(self.stmr_adams.stem('harping'), 'harp') + self.assertEqual(self.stmr_adams.stem('charring'), 'char') # not stem false progressive verbs such as 'sing' - self.assertEqual(self.stmr.stem('ring', var='Adams'), 'ring') - self.assertEqual(self.stmr.stem('sing', var='Adams'), 'sing') - self.assertEqual(self.stmr.stem('ring', var='Adams'), 'ring') - self.assertEqual(self.stmr.stem('bring', var='Adams'), 'bring') - self.assertEqual(self.stmr.stem('fling', var='Adams'), 'fling') + 
self.assertEqual(self.stmr_adams.stem('ring'), 'ring') + self.assertEqual(self.stmr_adams.stem('sing'), 'sing') + self.assertEqual(self.stmr_adams.stem('ring'), 'ring') + self.assertEqual(self.stmr_adams.stem('bring'), 'bring') + self.assertEqual(self.stmr_adams.stem('fling'), 'fling') # stem various plural nouns and 3rd-pres verbs without the -s/-es - self.assertEqual(self.stmr.stem('changes', var='Adams'), 'change') - self.assertEqual(self.stmr.stem('deaths', var='Adams'), 'death') - self.assertEqual(self.stmr.stem('shadows', var='Adams'), 'shadow') - self.assertEqual(self.stmr.stem('flies', var='Adams'), 'fly') - self.assertEqual(self.stmr.stem('things', var='Adams'), 'thing') - self.assertEqual(self.stmr.stem('nothings', var='Adams'), 'nothing') - self.assertEqual(self.stmr.stem('witches', var='Adams'), 'witch') - self.assertEqual(self.stmr.stem('makes', var='Adams'), 'make') - self.assertEqual(self.stmr.stem('smokes', var='Adams'), 'smoke') - self.assertEqual(self.stmr.stem('does', var='Adams'), 'do') + self.assertEqual(self.stmr_adams.stem('changes'), 'change') + self.assertEqual(self.stmr_adams.stem('deaths'), 'death') + self.assertEqual(self.stmr_adams.stem('shadows'), 'shadow') + self.assertEqual(self.stmr_adams.stem('flies'), 'fly') + self.assertEqual(self.stmr_adams.stem('things'), 'thing') + self.assertEqual(self.stmr_adams.stem('nothings'), 'nothing') + self.assertEqual(self.stmr_adams.stem('witches'), 'witch') + self.assertEqual(self.stmr_adams.stem('makes'), 'make') + self.assertEqual(self.stmr_adams.stem('smokes'), 'smoke') + self.assertEqual(self.stmr_adams.stem('does'), 'do') # stem various words with -des suffix - self.assertEqual(self.stmr.stem('abodes', var='Adams'), 'abode') - self.assertEqual(self.stmr.stem('escapades', var='Adams'), 'escapade') - self.assertEqual(self.stmr.stem('crusades', var='Adams'), 'crusade') - self.assertEqual(self.stmr.stem('grades', var='Adams'), 'grade') + self.assertEqual(self.stmr_adams.stem('abodes'), 'abode') + self.assertEqual(self.stmr_adams.stem('escapades'), 'escapade') + self.assertEqual(self.stmr_adams.stem('crusades'), 'crusade') + self.assertEqual(self.stmr_adams.stem('grades'), 'grade') # stem various words with -res suffix - self.assertEqual(self.stmr.stem('wires', var='Adams'), 'wire') - self.assertEqual(self.stmr.stem('acres', var='Adams'), 'acre') - self.assertEqual(self.stmr.stem('fires', var='Adams'), 'fire') - self.assertEqual(self.stmr.stem('cares', var='Adams'), 'care') + self.assertEqual(self.stmr_adams.stem('wires'), 'wire') + self.assertEqual(self.stmr_adams.stem('acres'), 'acre') + self.assertEqual(self.stmr_adams.stem('fires'), 'fire') + self.assertEqual(self.stmr_adams.stem('cares'), 'care') # stem acronyms when pluralized otherwise they should be left alone - self.assertEqual(self.stmr.stem('USA', var='Adams'), 'USA') - self.assertEqual(self.stmr.stem('FLOSS', var='Adams'), 'FLOSS') - self.assertEqual(self.stmr.stem('MREs', var='Adams'), 'MRE') - self.assertEqual(self.stmr.stem('USAED', var='Adams'), 'USAED') + self.assertEqual(self.stmr_adams.stem('USA'), 'USA') + self.assertEqual(self.stmr_adams.stem('FLOSS'), 'FLOSS') + self.assertEqual(self.stmr_adams.stem('MREs'), 'MRE') + self.assertEqual(self.stmr_adams.stem('USAED'), 'USAED') # Perl version tests self.assertEqual(self.stmr.stem('ragings'), 'rage') - self.assertEqual(self.stmr.stem('ragings', var='Perl'), 'rag') + self.assertEqual(UEALite(var='Perl').stem('ragings'), 'rag') # complete coverage self.assertEqual(self.stmr.stem('was'), 'was') 
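        # As with the other stemmers in this changeset, UEALite's options
        # (var, max_word_length, max_acro_length, return_rule_no) are now
        # constructor parameters, so variant behaviour comes from a configured
        # instance; expected values below are from this module's assertions:
        #
        #     UEALite(var='Adams').stem('theses')   # 'thesis'
        #     UEALite(var='Perl').stem('ragings')   # 'rag'
        #     UEALite().stem('ragings')             # 'rage'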
self.assertEqual(self.stmr.stem('during'), 'during') self.assertEqual( - self.stmr.stem('abcdefghijklmnopqrstuvwxyz', max_word_length=20), + UEALite(max_word_length=20).stem('abcdefghijklmnopqrstuvwxyz'), 'abcdefghijklmnopqrstuvwxyz', ) self.assertEqual(self.stmr.stem('10'), '10') @@ -180,11 +181,11 @@ def test_uealite(self): self.assertEqual(self.stmr.stem('top-10'), 'top-10') self.assertEqual(self.stmr.stem('top_ten'), 'top_ten') self.assertEqual( - self.stmr.stem('ABCDEFGHIJKLMs', max_acro_length=8, var='Adams'), + UEALite(max_acro_length=8, var='Adams').stem('ABCDEFGHIJKLMs'), 'ABCDEFGHIJKLMs', ) self.assertEqual( - self.stmr.stem('ABCDEFGHIJKLM', max_acro_length=8, var='Adams'), + UEALite(max_acro_length=8, var='Adams').stem('ABCDEFGHIJKLM'), 'ABCDEFGHIJKLM', ) self.assertEqual(self.stmr.stem('abcDefGhij'), 'abcDefGhij') @@ -195,13 +196,11 @@ def test_uealite(self): def test_uealite_wsj_set(self): """Test abydos.stemmer.UEALite (WSJ testset).""" + stmr_rrn = UEALite(return_rule_no=True) with open(_corpus_file('uea-lite_wsj.csv')) as wsj_ts: for wsj_line in wsj_ts: (word, uea, rule) = wsj_line.strip().split(',') - self.assertEqual( - self.stmr.stem(word, return_rule_no=True), - (uea, float(rule)), - ) + self.assertEqual(stmr_rrn.stem(word), (uea, float(rule))) if __name__ == '__main__': diff --git a/tests/tokenizer/test_tokenizer__tokenizer.py b/tests/tokenizer/test_tokenizer__tokenizer.py new file mode 100644 index 000000000..3403f2b08 --- /dev/null +++ b/tests/tokenizer/test_tokenizer__tokenizer.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_tokenizer. 
+ +This module contains unit tests for abydos.tokenizer._Tokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import sys +import unittest +from collections import Counter +from math import log1p + +from abydos.tokenizer import QGrams, QSkipgrams, _Tokenizer + + +class TokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer._Tokenizer.""" + + def test__tokenizer(self): + """Test abydos.tokenizer._Tokenizer.""" + self.assertEqual( + _Tokenizer().tokenize('').get_counter(), Counter({'': 1}) + ) + self.assertEqual( + _Tokenizer().tokenize('a').get_counter(), Counter({'a': 1}) + ) + + self.assertEqual( + _Tokenizer().tokenize('NELSON').get_counter(), + Counter({'NELSON': 1}), + ) + self.assertEqual( + _Tokenizer().tokenize('NEILSEN').get_counter(), + Counter({'NEILSEN': 1}), + ) + self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1) + self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1) + + tweet = 'Good to be home for a night' + self.assertEqual( + _Tokenizer().tokenize(tweet).get_counter(), + Counter({'Good to be home for a night': 1}), + ) + + nelson = QGrams().tokenize('NELSON') + neilsen = QGrams().tokenize('NEILSEN') + self.assertEqual( + nelson.get_set(), {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'} + ) + self.assertEqual( + nelson.get_list(), ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#'] + ) + if sys.version_info >= (3, 6): + self.assertEqual( + repr(nelson), + "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, 'ON': 1, \ +'N#': 1})", + ) + self.assertEqual( + nelson & neilsen, Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1}) + ) + self.assertEqual( + nelson + neilsen, + Counter( + { + '$N': 2, + 'NE': 2, + 'EL': 1, + 'LS': 2, + 'SO': 1, + 'ON': 1, + 'N#': 2, + 'EI': 1, + 'IL': 1, + 'SE': 1, + 'EN': 1, + } + ), + ) + self.assertEqual( + nelson - neilsen, Counter({'EL': 1, 'SO': 1, 'ON': 1}) + ) + + nelsonnelson = QGrams(scaler='set').tokenize('NELSONNELSON') + self.assertEqual(nelsonnelson.count(), 8) + + nelson_ssk = QSkipgrams(scaler='SSK').tokenize('NELSON') + self.assertAlmostEqual(nelson_ssk.count(), 18.66784401) + + nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON') + gold_standard = Counter( + { + '$$N': 1.0986122886681096, + '$$E': 0.6931471805599453, + '$$L': 0.6931471805599453, + '$$S': 0.6931471805599453, + '$$O': 0.6931471805599453, + '$$#': 1.0986122886681096, + '$NE': 1.0986122886681096, + '$NL': 1.0986122886681096, + '$NS': 1.0986122886681096, + '$NO': 1.0986122886681096, + '$NN': 1.0986122886681096, + '$N#': 2.1972245773362196, + '$EL': 1.0986122886681096, + '$ES': 1.0986122886681096, + '$EO': 1.0986122886681096, + '$EN': 1.0986122886681096, + '$E#': 1.6094379124341003, + '$LS': 1.0986122886681096, + '$LO': 1.0986122886681096, + '$LN': 1.0986122886681096, + '$L#': 1.6094379124341003, + '$SO': 1.0986122886681096, + '$SN': 1.0986122886681096, + '$S#': 1.6094379124341003, + '$ON': 1.0986122886681096, + '$O#': 1.6094379124341003, + '$##': 1.0986122886681096, + 'NEL': 0.6931471805599453, + 'NES': 0.6931471805599453, + 'NEO': 0.6931471805599453, + 'NEN': 0.6931471805599453, + 'NE#': 1.0986122886681096, + 'NLS': 0.6931471805599453, + 'NLO': 0.6931471805599453, + 'NLN': 0.6931471805599453, + 'NL#': 1.0986122886681096, + 'NSO': 0.6931471805599453, + 'NSN': 0.6931471805599453, + 'NS#': 1.0986122886681096, + 'NON': 0.6931471805599453, + 'NO#': 1.0986122886681096, + 'NN#': 1.0986122886681096, + 'N##': 1.0986122886681096, + 'ELS': 0.6931471805599453, + 'ELO': 
0.6931471805599453, + 'ELN': 0.6931471805599453, + 'EL#': 1.0986122886681096, + 'ESO': 0.6931471805599453, + 'ESN': 0.6931471805599453, + 'ES#': 1.0986122886681096, + 'EON': 0.6931471805599453, + 'EO#': 1.0986122886681096, + 'EN#': 1.0986122886681096, + 'E##': 0.6931471805599453, + 'LSO': 0.6931471805599453, + 'LSN': 0.6931471805599453, + 'LS#': 1.0986122886681096, + 'LON': 0.6931471805599453, + 'LO#': 1.0986122886681096, + 'LN#': 1.0986122886681096, + 'L##': 0.6931471805599453, + 'SON': 0.6931471805599453, + 'SO#': 1.0986122886681096, + 'SN#': 1.0986122886681096, + 'S##': 0.6931471805599453, + 'ON#': 1.0986122886681096, + 'O##': 0.6931471805599453, + } + ) + test_counter = nelson_log.get_counter() + for key in test_counter: + self.assertAlmostEqual(test_counter[key], gold_standard[key]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_c_or_v_cluster.py b/tests/tokenizer/test_tokenizer_c_or_v_cluster.py new file mode 100644 index 000000000..402765b95 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_c_or_v_cluster.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_c_or_v_cluster. 
+ +This module contains unit tests for abydos.tokenizer.COrVClusterTokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import COrVClusterTokenizer + +from six import PY2 + + +class COrVClusterTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.COrVClusterTokenizer.""" + + def test_c_or_v_cluster_tokenizer(self): + """Test abydos.tokenizer.COrVClusterTokenizer.""" + self.assertEqual( + sorted(COrVClusterTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(COrVClusterTokenizer().tokenize('a').get_list()), ['a'] + ) + + tok = COrVClusterTokenizer() + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), + sorted(['n', 'e', 'ls', 'o', 'n']), + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), + sorted(['n', 'ei', 'ls', 'o', 'n']), + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['p', 'e', 'n', 'i', 'ns', 'u', 'l', 'a', 'r']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['sp', 'e', 'ct', 'a', 'c', 'u', 'l', 'a', 'r']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['s', 'u', 'ff', 'i', 'c', 'ie', 'ntl', 'y']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['ya', 'cht', 'i', 'ng']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['c', 'a', 't', 'e', 'rp', 'i', 'll', 'a', 'rs']), + ) + if not PY2: + self.assertEqual( + sorted(tok.tokenize('Götterdämmerung').get_list()), + sorted( + ['G', 'ö', 'tt', 'e', 'rd', 'ä', 'mm', 'e', 'r', 'u', 'ng'] + ), + ) + + tok = COrVClusterTokenizer(consonants='ptkbdgmn', vowels='aeiouwy') + self.assertEqual( + sorted(tok.tokenize('#winning #losing').get_list()), + sorted(['#', 'wi', 'nn', 'i', 'ng', '#', 'losing']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_character.py b/tests/tokenizer/test_tokenizer_character.py new file mode 100644 index 000000000..6574ca346 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_character.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import CharacterTokenizer + + +class CharacterTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.CharacterTokenizer.""" + + def test_character_tokenizer(self): + """Test abydos.tokenizer.CharacterTokenizer.""" + self.assertEqual( + sorted(CharacterTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(CharacterTokenizer().tokenize('a').get_list()), ['a'] + ) + + self.assertEqual( + sorted(CharacterTokenizer().tokenize('NELSON').get_list()), + sorted(['N', 'E', 'L', 'S', 'O', 'N']), + ) + + def test_character_tokenizer_intersections(self): + """Test abydos.tokenizer.CharacterTokenizer intersections.""" + self.assertEqual( + sorted( + CharacterTokenizer().tokenize('NELSON') + & CharacterTokenizer().tokenize('') + ), + [], + ) + self.assertEqual( + sorted( + CharacterTokenizer().tokenize('') + & CharacterTokenizer().tokenize('NEILSEN') + ), + [], + ) + self.assertEqual( + sorted( + CharacterTokenizer().tokenize('NELSON') + & CharacterTokenizer().tokenize('NEILSEN') + ), + sorted(['N', 'E', 'L', 'S']), + ) + self.assertEqual( + sorted( + CharacterTokenizer().tokenize('NAIL') + & CharacterTokenizer().tokenize('LIAN') + ), + sorted(['N', 'A', 'I', 'L']), + ) + + def test_character_tokenizer_counts(self): + """Test abydos.tokenizer.CharacterTokenizer counts.""" + self.assertEqual(CharacterTokenizer().tokenize('').count(), 0) + self.assertEqual(len(CharacterTokenizer().tokenize('').get_list()), 0) + + self.assertEqual(CharacterTokenizer().tokenize('NEILSEN').count(), 7) + self.assertEqual(CharacterTokenizer().tokenize('NELSON').count(), 6) + + self.assertEqual( + len(CharacterTokenizer().tokenize('NEILSEN').get_list()), 7 + ) + self.assertEqual( + len(CharacterTokenizer().tokenize('NELSON').get_list()), 6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_cv_cluster.py b/tests/tokenizer/test_tokenizer_cv_cluster.py new file mode 100644 index 000000000..d11953559 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_cv_cluster.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_cv_cluster. 
+ +This module contains unit tests for abydos.tokenizer.CVClusterTokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import CVClusterTokenizer + +from six import PY2 + + +class CVClusterTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.CVClusterTokenizer.""" + + def test_cv_cluster_tokenizer(self): + """Test abydos.tokenizer.CVClusterTokenizer.""" + self.assertEqual( + sorted(CVClusterTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(CVClusterTokenizer().tokenize('a').get_list()), ['a'] + ) + + tok = CVClusterTokenizer() + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), + sorted(['ne', 'lso', 'n']), + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), + sorted(['nei', 'lso', 'n']), + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['pe', 'ni', 'nsu', 'la', 'r']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['spe', 'cta', 'cu', 'la', 'r']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['su', 'ffi', 'cie', 'ntly']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['ya', 'chti', 'ng']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['ca', 'te', 'rpi', 'lla', 'rs']), + ) + if not PY2: + self.assertEqual( + sorted(tok.tokenize('Götterdämmerung').get_list()), + sorted(['Gö', 'tte', 'rdä', 'mme', 'ru', 'ng']), + ) + + tok = CVClusterTokenizer(consonants='ptkbdgmn', vowels='aeiouwy') + self.assertEqual( + sorted(tok.tokenize('#winning #losing').get_list()), + sorted(['#', 'wi', 'nni', 'ng', '#', 'losing']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_legalipy.py b/tests/tokenizer/test_tokenizer_legalipy.py new file mode 100644 index 000000000..344e58994 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_legalipy.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. + +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import LegaliPyTokenizer + +from six import PY2 + +from .. 
import _corpus_file + + +class LegaliPyTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.LegaliPyTokenizer.""" + + def test_legalipy_tokenizer(self): + """Test abydos.tokenizer.LegaliPyTokenizer.""" + if PY2: # skip tests of SyllabiPy on Python 2.7 + return + try: + from syllabipy.legalipy import LegaliPy # noqa: F401 + except ImportError: # pragma: no cover + return + + self.assertEqual( + sorted(LegaliPyTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(LegaliPyTokenizer().tokenize('a').get_list()), ['a'] + ) + + self.assertEqual( + sorted(LegaliPyTokenizer().tokenize('nelson').get_list()), + sorted(['n', 'els', 'on']), + ) + self.assertEqual( + sorted(LegaliPyTokenizer().tokenize('neilson').get_list()), + sorted(['n', 'eils', 'on']), + ) + + tok = LegaliPyTokenizer() + with open(_corpus_file('wikipediaCommonMisspellings.csv')) as corpus: + text = ' '.join([_.split(',')[1] for _ in corpus.readlines()]) + tok.train_onsets(text) + + with open(_corpus_file('misspellings.csv')) as corpus: + text = ' '.join([_.split(',')[1] for _ in corpus.readlines()]) + tok.train_onsets(text, append=True) + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), sorted(['nel', 'son']) + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), + sorted(['ne', 'il', 'son']), + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['pe', 'nin', 'su', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['spec', 'ta', 'cu', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['suf', 'fi', 'ci', 'ent', 'ly']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['y', 'ach', 'ting']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['ca', 'ter', 'pil', 'lars']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_nltk.py b/tests/tokenizer/test_tokenizer_nltk.py new file mode 100644 index 000000000..0d4ded9e6 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_nltk.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import NLTKTokenizer + +try: + from nltk import TweetTokenizer +except ImportError: # pragma: no cover + TweetTokenizer = None + +from six import PY2 + + +class NLTKTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.NLTKTokenizer.""" + + def test_nltk_tokenizer(self): + """Test abydos.tokenizer.NLTKTokenizer.""" + if PY2: # skip tests of NLTK on Python 2.7 + return + + if TweetTokenizer is None: # pragma: no cover + return + + tok = NLTKTokenizer(nltk_tokenizer=TweetTokenizer()) + + self.assertEqual(sorted(tok.tokenize('').get_list()), []) + self.assertEqual(sorted(tok.tokenize('a').get_list()), ['a']) + + self.assertEqual( + sorted(tok.tokenize('NELSON').get_list()), sorted(['NELSON']) + ) + self.assertEqual( + sorted(tok.tokenize('NEILSEN').get_list()), sorted(['NEILSEN']) + ) + + tweet1 = 'Big night of basketball - @Warriors chasing 73 and a\ + farewell for an all-timer, @KobeBryant. NBA fans feeling like:' + self.assertEqual( + sorted(tok.tokenize(tweet1).get_list()), + sorted( + [ + 'Big', + 'night', + 'of', + 'basketball', + '-', + '@Warriors', + 'chasing', + '73', + 'and', + 'a', + 'farewell', + 'for', + 'an', + 'all-timer', + ',', + '@KobeBryant', + '.', + 'NBA', + 'fans', + 'feeling', + 'like', + ':', + ] + ), + ) + + tweet2 = 'Einstein was right! Congrats to @NSF and @LIGO on detecting\ + gravitational waves - a huge breakthrough in how we understand the\ + universe.' + self.assertEqual( + sorted(tok.tokenize(tweet2).get_list()), + sorted( + [ + 'Einstein', + 'was', + 'right', + '!', + 'Congrats', + 'to', + '@NSF', + 'and', + '@LIGO', + 'on', + 'detecting', + 'gravitational', + 'waves', + '-', + 'a', + 'huge', + 'breakthrough', + 'in', + 'how', + 'we', + 'understand', + 'the', + 'universe', + '.', + ] + ), + ) + + with self.assertRaises(TypeError): + NLTKTokenizer(nltk_tokenizer=TweetTokenizer) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_q_grams.py b/tests/tokenizer/test_tokenizer_q_grams.py new file mode 100644 index 000000000..af5995563 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_q_grams.py @@ -0,0 +1,392 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_q_grams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import sys +import unittest +from collections import Counter +from math import log1p + +from abydos.tokenizer import QGrams + + +class QGramsTestCases(unittest.TestCase): + """Test abydos.tokenizer.QGrams.""" + + def test_qgrams(self): + """Test abydos.tokenizer.QGrams.""" + self.assertEqual(sorted(QGrams().tokenize('').get_list()), []) + self.assertEqual( + sorted(QGrams(2).tokenize('a').get_list()), ['$a', 'a#'] + ) + self.assertEqual(sorted(QGrams(-1).tokenize('NELSON').get_list()), []) + + self.assertEqual( + sorted(QGrams(3).tokenize('NELSON').get_list()), + sorted(['$$N', '$NE', 'NEL', 'ELS', 'LSO', 'SON', 'ON#', 'N##']), + ) + self.assertEqual( + sorted(QGrams(7).tokenize('NELSON').get_list()), + sorted( + [ + '$$$$$$N', + '$$$$$NE', + '$$$$NEL', + '$$$NELS', + '$$NELSO', + '$NELSON', + 'ELSON##', + 'LSON###', + 'N######', + 'NELSON#', + 'ON#####', + 'SON####', + ] + ), + ) + + # http://www.sound-ex.com/alternative_qgram.htm + self.assertEqual( + sorted(QGrams().tokenize('NELSON').get_list()), + sorted(['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']), + ) + self.assertEqual( + sorted(QGrams().tokenize('NEILSEN').get_list()), + sorted(['$N', 'NE', 'EI', 'IL', 'LS', 'SE', 'EN', 'N#']), + ) + self.assertEqual( + sorted(QGrams(start_stop='').tokenize('NELSON').get_list()), + sorted(['NE', 'EL', 'LS', 'SO', 'ON']), + ) + self.assertEqual( + sorted(QGrams(start_stop='').tokenize('NEILSEN').get_list()), + sorted(['NE', 'EI', 'IL', 'LS', 'SE', 'EN']), + ) + + # qval=(1,2) + self.assertEqual( + sorted(QGrams(qval=(1, 2)).tokenize('NELSON').get_list()), + sorted( + [ + '$N', + 'E', + 'EL', + 'L', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'O', + 'ON', + 'S', + 'SO', + ] + ), + ) + self.assertEqual( + sorted(QGrams(qval=(2, 1)).tokenize('NELSON').get_list()), + sorted( + [ + '$N', + 'E', + 'EL', + 'L', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'O', + 'ON', + 'S', + 'SO', + ] + ), + ) + self.assertEqual( + sorted(QGrams(qval=range(3)).tokenize('NELSON').get_list()), + sorted( + [ + '$N', + 'E', + 'EL', + 'L', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'O', + 'ON', + 'S', + 'SO', + ] + ), + ) + self.assertEqual(QGrams(qval=(1, 2)).tokenize('NELSON').count(), 13) + + # skip=(1,2) + self.assertEqual( + sorted(QGrams(skip=(2, 1, 0)).tokenize('NELSON').get_list()), + sorted( + [ + '$E', + '$L', + '$N', + 'EL', + 'EO', + 'ES', + 'LN', + 'LO', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'NL', + 'NS', + 'O', + 'O#', + 'ON', + 'S#', + 'SN', + 'SO', + ] + ), + ) + self.assertEqual( + sorted(QGrams(skip=(2, 1, 0)).tokenize('NELSON').get_list()), + sorted( + [ + '$E', + '$L', + '$N', + 'EL', + 'EO', + 'ES', + 'LN', + 'LO', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'NL', + 'NS', + 'O', + 'O#', + 'ON', + 'S#', + 'SN', + 'SO', + ] + ), + ) + self.assertEqual( + sorted(QGrams(skip=range(3)).tokenize('NELSON').get_list()), + sorted( + [ + '$E', + '$L', + '$N', + 'EL', + 'EO', + 'ES', + 'LN', + 'LO', + 'LS', + 'N', + 'N', + 'N#', + 'NE', + 'NL', + 'NS', + 'O', + 'O#', + 'ON', + 'S#', + 'SN', + 'SO', + ] + ), + ) + self.assertEqual(QGrams(skip=(0, 1, 2)).tokenize('NELSON').count(), 21) + self.assertEqual( + QGrams(qval=1).tokenize('COLIN').get_counter(), + Counter({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1}), + ) + self.assertEqual( + QGrams(qval=10, start_stop='').tokenize('COLIN').get_counter(), + Counter({}), + ) + if sys.version_info >= (3, 6): + 
self.assertEqual( + repr(QGrams(qval=1).tokenize('COLIN')), + "QGrams({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1})", + ) + self.assertEqual( + QGrams(qval=1).tokenize('COLIN').get_set(), + {'C', 'O', 'L', 'I', 'N'}, + ) + + # Test exception + self.assertRaises(ValueError, QGrams, 0) + + def test_qgrams_intersections(self): + """Test abydos.tokenizer.QGrams intersections.""" + self.assertEqual( + sorted(QGrams().tokenize('NELSON') & QGrams().tokenize('')), [] + ) + self.assertEqual( + sorted(QGrams().tokenize('') & QGrams().tokenize('NEILSEN')), [] + ) + self.assertEqual( + sorted(QGrams().tokenize('NELSON') & QGrams().tokenize('NEILSEN')), + sorted(['$N', 'NE', 'LS', 'N#']), + ) + self.assertEqual( + sorted(QGrams().tokenize('NELSON') & QGrams().tokenize('NOSLEN')), + sorted(['$N', 'N#']), + ) + self.assertEqual( + sorted(QGrams().tokenize('NAIL') & QGrams().tokenize('LIAN')), [] + ) + + self.assertEqual( + sorted( + QGrams(start_stop='').tokenize('NELSON') + & QGrams(start_stop='').tokenize('NEILSEN') + ), + sorted(['NE', 'LS']), + ) + self.assertEqual( + sorted( + QGrams(start_stop='').tokenize('NELSON') + & QGrams(start_stop='').tokenize('NOSLEN') + ), + [], + ) + self.assertEqual( + sorted( + QGrams(start_stop='').tokenize('NAIL') + & QGrams(start_stop='').tokenize('LIAN') + ), + [], + ) + + def test_qgrams_counts(self): + """Test abydos.tokenizer.QGrams counts.""" + self.assertEqual(QGrams().tokenize('').count(), 0) + self.assertEqual(len(QGrams().tokenize('').get_list()), 0) + + self.assertEqual(QGrams().tokenize('NEILSEN').count(), 8) + self.assertEqual(QGrams().tokenize('NELSON').count(), 7) + self.assertEqual(QGrams(start_stop='').tokenize('NEILSEN').count(), 6) + self.assertEqual(QGrams(start_stop='').tokenize('NELSON').count(), 5) + + self.assertEqual(len(QGrams().tokenize('NEILSEN').get_list()), 8) + self.assertEqual(len(QGrams().tokenize('NELSON').get_list()), 7) + self.assertEqual( + len(QGrams(start_stop='').tokenize('NEILSEN').get_list()), 6 + ) + self.assertEqual( + len(QGrams(start_stop='').tokenize('NELSON').get_list()), 5 + ) + + self.assertEqual( + QGrams(scaler='set').tokenize('ACAACACCTAG').get_counter(), + Counter( + { + '$A': 1, + 'AC': 1, + 'CA': 1, + 'AA': 1, + 'CC': 1, + 'CT': 1, + 'TA': 1, + 'AG': 1, + 'G#': 1, + } + ), + ) + + gold_standard = Counter( + { + '$A': 0.6931471805599453, + 'AC': 1.3862943611198906, + 'CA': 1.0986122886681096, + 'AA': 0.6931471805599453, + 'CC': 0.6931471805599453, + 'CT': 0.6931471805599453, + 'TA': 0.6931471805599453, + 'AG': 0.6931471805599453, + 'G#': 0.6931471805599453, + } + ) + test_counter = ( + QGrams(scaler=log1p).tokenize('ACAACACCTAG').get_counter() + ) + for key in test_counter: + self.assertAlmostEqual(test_counter[key], gold_standard[key]) + + self.assertEqual( + QGrams(scaler=log1p).tokenize('ACAACACCTAG').count_unique(), 9 + ) + + tokens1 = QGrams().tokenize('ACAACACCTAG') + tokens2 = QGrams().tokenize('GAAGATAC') + self.assertEqual( + tokens1 - tokens2, + Counter({'$A': 1, 'AC': 2, 'CA': 2, 'CC': 1, 'CT': 1, 'G#': 1}), + ) + self.assertEqual( + tokens1 + tokens2, + Counter( + { + '$A': 1, + 'AC': 4, + 'CA': 2, + 'AA': 2, + 'CC': 1, + 'CT': 1, + 'TA': 2, + 'AG': 2, + 'G#': 1, + '$G': 1, + 'GA': 2, + 'AT': 1, + 'C#': 1, + } + ), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_q_skipgrams.py b/tests/tokenizer/test_tokenizer_q_skipgrams.py new file mode 100644 index 000000000..3ab46b036 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_q_skipgrams.py @@ -0,0 +1,438 @@ +# 
-*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_q_skipgrams. + +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest +from collections import Counter + +from abydos.tokenizer import QSkipgrams + + +class QSkipgramsTestCases(unittest.TestCase): + """Test abydos.tokenizer.QSkipgrams.""" + + def test_qskipgrams(self): + """Test abydos.tokenizer.QSkipgrams.""" + self.assertEqual(sorted(QSkipgrams().tokenize('').get_list()), []) + self.assertEqual( + sorted(QSkipgrams(start_stop='').tokenize('a').get_list()), [] + ) + self.assertEqual( + sorted(QSkipgrams().tokenize('a').get_list()), ['$#', '$a', 'a#'] + ) + self.assertEqual( + sorted(QSkipgrams().tokenize('ab').get_list()), + sorted(['$a', '$b', '$#', 'ab', 'a#', 'b#']), + ) + + self.assertEqual( + sorted(QSkipgrams().tokenize('NELSON').get_list()), + sorted( + [ + '$N', + '$E', + '$L', + '$S', + '$O', + '$N', + '$#', + 'NE', + 'NL', + 'NS', + 'NO', + 'NN', + 'N#', + 'EL', + 'ES', + 'EO', + 'EN', + 'E#', + 'LS', + 'LO', + 'LN', + 'L#', + 'SO', + 'SN', + 'S#', + 'ON', + 'O#', + 'N#', + ] + ), + ) + self.assertEqual( + sorted(QSkipgrams().tokenize('NEILSEN').get_list()), + sorted( + [ + '$N', + '$E', + '$I', + '$L', + '$S', + '$E', + '$N', + '$#', + 'NE', + 'NI', + 'NL', + 'NS', + 'NE', + 'NN', + 'N#', + 'EI', + 'EL', + 'ES', + 'EE', + 'EN', + 'E#', + 'IL', + 'IS', + 'IE', + 'IN', + 'I#', + 'LS', + 'LE', + 'LN', + 'L#', + 'SE', + 'SN', + 'S#', + 'EN', + 'E#', + 'N#', + ] + ), + ) + + self.assertEqual( + sorted(QSkipgrams(qval=1).tokenize('NEILSEN').get_list()), + sorted(['N', 'E', 'I', 'L', 'S', 'E', 'N']), + ) + self.assertEqual( + QSkipgrams(qval=(2,), scaler='SSK') + .tokenize('NEILSEN') + .get_counter(), + Counter( + { + '$N': 1.2404672100000003, + '$E': 1.2072969000000002, + '$I': 0.6561, + '$L': 0.5904900000000001, + '$S': 0.531441, + '$#': 0.3874204890000001, + 'NE': 1.341441, + 'NI': 0.7290000000000001, + 'NL': 0.6561, + 'NS': 0.5904900000000001, + 'NN': 0.4782969000000001, + 'N#': 1.2404672100000003, + 'EI': 0.81, + 'EL': 0.7290000000000001, + 'ES': 0.6561, + 'EE': 0.5904900000000001, + 'EN': 1.341441, + 'E#': 1.2072969000000002, + 'IL': 0.81, + 'IS': 0.7290000000000001, + 'IE': 0.6561, + 'IN': 0.5904900000000001, + 'I#': 0.531441, + 'LS': 0.81, + 'LE': 0.7290000000000001, + 'LN': 0.6561, + 'L#': 0.5904900000000001, + 'SE': 0.81, + 'SN': 0.7290000000000001, + 'S#': 0.6561, + } + ), + ) + self.assertEqual( + QSkipgrams(qval=(4, 6, 5, 1, 0), scaler='SSK') + .tokenize('NIALL') + .get_counter(), + Counter( + { + '$$$N': 0.531441, + '$$$I': 0.4782969000000001, + '$$$A': 0.4304672100000001, + '$$$L': 0.7360989291000002, + '$$$#': 0.8504267154039002, + '$$NI': 1.4880348000000003, + '$$NA': 1.3392313200000003, + '$$NL': 
2.2900855572000007, + '$$N#': 2.645772003478801, + '$$IA': 1.3392313200000003, + '$$IL': 2.2900855572000007, + '$$I#': 2.645772003478801, + '$$AL': 2.2900855572000007, + '$$A#': 2.645772003478801, + '$$LL': 1.0847773692000002, + '$$L#': 5.291544006957601, + '$$##': 2.460275073345601, + '$NIA': 1.4402051100000002, + '$NIL': 2.462750738100001, + '$NI#': 2.845254813264901, + '$NAL': 2.462750738100001, + '$NA#': 2.845254813264901, + '$NLL': 1.1665661391000004, + '$NL#': 5.690509626529802, + '$N##': 2.645772003478801, + '$IAL': 2.462750738100001, + '$IA#': 2.845254813264901, + '$ILL': 1.1665661391000004, + '$IL#': 5.690509626529802, + '$I##': 2.645772003478801, + '$ALL': 1.1665661391000004, + '$AL#': 5.690509626529802, + '$A##': 2.645772003478801, + '$LL#': 2.845254813264901, + '$L##': 5.291544006957601, + '$###': 0.8504267154039002, + 'NIAL': 1.0097379000000002, + 'NIA#': 1.1665661391000002, + 'NILL': 0.4782969000000001, + 'NIL#': 2.3331322782000004, + 'NI##': 1.0847773692000002, + 'NALL': 0.4782969000000001, + 'NAL#': 2.3331322782000004, + 'NA##': 1.0847773692000002, + 'NLL#': 1.1665661391000002, + 'NL##': 2.1695547384000005, + 'N###': 0.3486784401000001, + 'IALL': 0.531441, + 'IAL#': 2.5923691980000005, + 'IA##': 1.2053081880000003, + 'ILL#': 1.2961845990000003, + 'IL##': 2.4106163760000006, + 'I###': 0.3874204890000001, + 'ALL#': 1.4402051100000004, + 'AL##': 2.6784626400000007, + 'A###': 0.4304672100000001, + 'LL##': 1.4880348000000003, + 'L###': 1.0097379000000002, + '$$$$$N': 0.3486784401000001, + '$$$$$I': 0.31381059609000006, + '$$$$$A': 0.2824295364810001, + '$$$$$L': 0.48295450738251017, + '$$$$$#': 0.8431447750407974, + '$$$$NI': 1.6039208244600003, + '$$$$NA': 1.4435287420140006, + '$$$$NL': 2.468434148843941, + '$$$$N#': 4.309406627986299, + '$$$$IA': 1.4435287420140006, + '$$$$IL': 2.468434148843941, + '$$$$I#': 4.309406627986299, + '$$$$AL': 2.468434148843941, + '$$$$A#': 4.309406627986299, + '$$$$LL': 1.1692582810313406, + '$$$$L#': 8.618813255972597, + '$$$$##': 7.715070145397851, + '$$$NIA': 2.984687447256001, + '$$$NIL': 5.103815534807762, + '$$$NI#': 8.910270709073119, + '$$$NAL': 5.103815534807762, + '$$$NA#': 8.910270709073119, + '$$$NLL': 2.417596832277361, + '$$$NL#': 17.82054141814625, + '$$$N##': 15.951932474542438, + '$$$IAL': 5.103815534807762, + '$$$IA#': 8.910270709073119, + '$$$ILL': 2.417596832277361, + '$$$IL#': 17.82054141814625, + '$$$I##': 15.951932474542438, + '$$$ALL': 2.417596832277361, + '$$$AL#': 17.82054141814625, + '$$$A##': 15.951932474542438, + '$$$LL#': 8.910270709073119, + '$$$L##': 31.903864949084834, + '$$$###': 15.08638445665049, + '$$NIAL': 5.396635688803742, + '$$NIA#': 9.42147782919388, + '$$NILL': 2.556301115749141, + '$$NIL#': 18.84295565838777, + '$$NI##': 16.867139400002937, + '$$NALL': 2.556301115749141, + '$$NAL#': 18.84295565838777, + '$$NA##': 16.867139400002937, + '$$NLL#': 9.42147782919388, + '$$NL##': 33.73427880000585, + '$$N###': 15.951932474542435, + '$$IALL': 2.556301115749141, + '$$IAL#': 18.84295565838777, + '$$IA##': 16.867139400002937, + '$$ILL#': 9.42147782919388, + '$$IL##': 33.73427880000585, + '$$I###': 15.951932474542435, + '$$ALL#': 9.42147782919388, + '$$AL##': 33.73427880000585, + '$$A###': 15.951932474542435, + '$$LL##': 16.867139400002937, + '$$L###': 31.903864949084824, + '$$####': 7.715070145397851, + '$NIALL': 1.4278730800535104, + '$NIAL#': 10.525109490228838, + '$NIA##': 9.421477829193876, + '$NILL#': 5.262554745114417, + '$NIL##': 18.842955658387766, + '$NI###': 8.910270709073117, + '$NALL#': 
5.262554745114417, + '$NAL##': 18.842955658387766, + '$NA###': 8.910270709073117, + '$NLL##': 9.421477829193876, + '$NL###': 17.820541418146256, + '$N####': 4.309406627986299, + '$IALL#': 5.262554745114417, + '$IAL##': 18.842955658387766, + '$IA###': 8.910270709073117, + '$ILL##': 9.421477829193876, + '$IL###': 17.820541418146256, + '$I####': 4.309406627986299, + '$ALL##': 9.421477829193876, + '$AL###': 17.820541418146256, + '$A####': 4.309406627986299, + '$LL###': 8.910270709073117, + '$L####': 8.618813255972595, + '$#####': 0.8431447750407974, + 'NIALL#': 1.4278730800535104, + 'NIAL##': 5.112602231498281, + 'NIA###': 2.417596832277361, + 'NILL##': 2.556301115749141, + 'NIL###': 4.835193664554721, + 'NI####': 1.1692582810313406, + 'NALL##': 2.556301115749141, + 'NAL###': 4.835193664554721, + 'NA####': 1.1692582810313406, + 'NLL###': 2.417596832277361, + 'NL####': 2.338516562062681, + 'N#####': 0.2287679245496101, + 'IALL##': 2.8403345730546006, + 'IAL###': 5.3724374050608015, + 'IA####': 1.2991758678126004, + 'ILL###': 2.6862187025304003, + 'IL####': 2.5983517356252004, + 'I#####': 0.2541865828329001, + 'ALL###': 2.984687447256001, + 'AL####': 2.887057484028001, + 'A#####': 0.2824295364810001, + 'LL####': 1.6039208244600003, + 'L#####': 0.6624890361900002, + '$$$$N': 0.4304672100000001, + '$$$$I': 0.3874204890000001, + '$$$$A': 0.3486784401000001, + '$$$$L': 0.5962401325710002, + '$$$$#': 0.8741476583623434, + '$$$NI': 1.5927286770000002, + '$$$NA': 1.4334558093000005, + '$$$NL': 2.4512094339030006, + '$$$N#': 3.59371815104519, + '$$$IA': 1.4334558093000005, + '$$$IL': 2.4512094339030006, + '$$$I#': 3.59371815104519, + '$$$AL': 2.4512094339030006, + '$$$A#': 3.59371815104519, + '$$$LL': 1.1610992055330005, + '$$$L#': 7.187436302090378, + '$$$##': 4.91876456439945, + '$$NIA': 2.2513435083000006, + '$$NIL': 3.849797399193001, + '$$NI#': 5.644187966956859, + '$$NAL': 3.849797399193001, + '$$NA#': 5.644187966956859, + '$$NLL': 1.8235882417230007, + '$$NL#': 11.28837593391372, + '$$N##': 7.725266868411147, + '$$IAL': 3.849797399193001, + '$$IA#': 5.644187966956859, + '$$ILL': 1.8235882417230007, + '$$IL#': 11.28837593391372, + '$$I##': 7.725266868411147, + '$$ALL': 1.8235882417230007, + '$$AL#': 11.28837593391372, + '$$A##': 7.725266868411147, + '$$LL#': 5.644187966956859, + '$$L##': 15.4505337368223, + '$$###': 4.918764564399449, + '$NIAL': 2.812715796861001, + '$NIA#': 4.123722629777913, + '$NILL': 1.3323390616710005, + '$NIL#': 8.247445259555828, + '$NI##': 5.644187966956858, + '$NALL': 1.3323390616710005, + '$NAL#': 8.247445259555828, + '$NA##': 5.644187966956858, + '$NLL#': 4.123722629777913, + '$NL##': 11.288375933913724, + '$N###': 3.593718151045189, + '$IALL': 1.3323390616710005, + '$IAL#': 8.247445259555828, + '$IA##': 5.644187966956858, + '$ILL#': 4.123722629777913, + '$IL##': 11.288375933913724, + '$I###': 3.593718151045189, + '$ALL#': 4.123722629777913, + '$AL##': 11.288375933913724, + '$A###': 3.593718151045189, + '$LL##': 5.644187966956858, + '$L###': 7.187436302090377, + '$####': 0.8741476583623434, + 'NIALL': 0.4304672100000001, + 'NIAL#': 2.664678123342001, + 'NIA##': 1.8235882417230007, + 'NILL#': 1.3323390616710005, + 'NIL##': 3.6471764834460014, + 'NI###': 1.1610992055330005, + 'NALL#': 1.3323390616710005, + 'NAL##': 3.6471764834460014, + 'NA###': 1.1610992055330005, + 'NLL##': 1.8235882417230007, + 'NL###': 2.322198411066001, + 'N####': 0.2824295364810001, + 'IALL#': 1.4803767351900001, + 'IAL##': 4.0524183149400015, + 'IA###': 1.2901102283700003, + 'ILL##': 
2.0262091574700007, + 'IL###': 2.5802204567400007, + 'I####': 0.31381059609000006, + 'ALL##': 2.2513435083000006, + 'AL###': 2.8669116186000005, + 'A####': 0.3486784401000001, + 'LL###': 1.5927286770000004, + 'L####': 0.8178876990000001, + 'N': 1.0, + 'I': 1.0, + 'A': 1.0, + 'L': 2.0, + } + ), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_qgrams.py b/tests/tokenizer/test_tokenizer_qgrams.py deleted file mode 100644 index c35e5553e..000000000 --- a/tests/tokenizer/test_tokenizer_qgrams.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2014-2018 by Christopher C. Little. -# This file is part of Abydos. -# -# Abydos is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Abydos is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Abydos. If not, see . - -"""abydos.tests.tokenizer.test_tokenizer_qgrams. - -This module contains unit tests for abydos.tokenizer.QGrams -""" - -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals, -) - -import unittest - -from abydos.tokenizer import QGrams - - -class QGramsTestCases(unittest.TestCase): - """Test abydos.tokenizer.QGrams.""" - - def test_qgrams(self): - """Test abydos.tokenizer.QGrams.""" - self.assertEqual(sorted(QGrams('').elements()), []) - self.assertEqual(sorted(QGrams('a', 2).elements()), []) - self.assertEqual(sorted(QGrams('NELSON', 0).elements()), []) - self.assertEqual(sorted(QGrams('NELSON', -1).elements()), []) - - self.assertEqual( - sorted(QGrams('NELSON', 3).elements()), - sorted(['$$N', '$NE', 'NEL', 'ELS', 'LSO', 'SON', 'ON#', 'N##']), - ) - self.assertEqual(sorted(QGrams('NELSON', 7).elements()), sorted([])) - - # http://www.sound-ex.com/alternative_qgram.htm - self.assertEqual( - sorted(QGrams('NELSON').elements()), - sorted(['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']), - ) - self.assertEqual( - sorted(QGrams('NEILSEN').elements()), - sorted(['$N', 'NE', 'EI', 'IL', 'LS', 'SE', 'EN', 'N#']), - ) - self.assertEqual( - sorted(QGrams('NELSON', start_stop='').elements()), - sorted(['NE', 'EL', 'LS', 'SO', 'ON']), - ) - self.assertEqual( - sorted(QGrams('NEILSEN', start_stop='').elements()), - sorted(['NE', 'EI', 'IL', 'LS', 'SE', 'EN']), - ) - - # qval=(1,2) - self.assertEqual( - sorted(QGrams('NELSON', qval=(1, 2)).elements()), - sorted( - [ - '$N', - 'E', - 'EL', - 'L', - 'LS', - 'N', - 'N', - 'N#', - 'NE', - 'O', - 'ON', - 'S', - 'SO', - ] - ), - ) - self.assertEqual( - sorted(QGrams('NELSON', qval=(2, 1)).elements()), - sorted( - [ - '$N', - 'E', - 'EL', - 'L', - 'LS', - 'N', - 'N', - 'N#', - 'NE', - 'O', - 'ON', - 'S', - 'SO', - ] - ), - ) - self.assertEqual( - sorted(QGrams('NELSON', qval=range(3)).elements()), - sorted( - [ - '$N', - 'E', - 'EL', - 'L', - 'LS', - 'N', - 'N', - 'N#', - 'NE', - 'O', - 'ON', - 'S', - 'SO', - ] - ), - ) - self.assertEqual(QGrams('NELSON', qval=(1, 2)).count(), 13) - - # skip=(1,2) - self.assertEqual( - sorted(QGrams('NELSON', skip=(2, 1, 0)).elements()), - sorted( - [ - '$E', - '$L', - '$N', - 'EL', - 'EO', - 'ES', - 'LN', - 'LO', - 'LS', - 'N', 
- 'N', - 'N#', - 'NE', - 'NL', - 'NS', - 'O', - 'O#', - 'ON', - 'S#', - 'SN', - 'SO', - ] - ), - ) - self.assertEqual( - sorted(QGrams('NELSON', skip=(2, 1, 0)).elements()), - sorted( - [ - '$E', - '$L', - '$N', - 'EL', - 'EO', - 'ES', - 'LN', - 'LO', - 'LS', - 'N', - 'N', - 'N#', - 'NE', - 'NL', - 'NS', - 'O', - 'O#', - 'ON', - 'S#', - 'SN', - 'SO', - ] - ), - ) - self.assertEqual( - sorted(QGrams('NELSON', skip=range(3)).elements()), - sorted( - [ - '$E', - '$L', - '$N', - 'EL', - 'EO', - 'ES', - 'LN', - 'LO', - 'LS', - 'N', - 'N', - 'N#', - 'NE', - 'NL', - 'NS', - 'O', - 'O#', - 'ON', - 'S#', - 'SN', - 'SO', - ] - ), - ) - self.assertEqual(QGrams('NELSON', skip=(0, 1, 2)).count(), 21) - - def test_qgram_intersections(self): - """Test abydos.tokenizer.QGrams intersections.""" - self.assertEqual(sorted(QGrams('NELSON') & QGrams('')), []) - self.assertEqual(sorted(QGrams('') & QGrams('NEILSEN')), []) - self.assertEqual( - sorted(QGrams('NELSON') & QGrams('NEILSEN')), - sorted(['$N', 'NE', 'LS', 'N#']), - ) - self.assertEqual( - sorted(QGrams('NELSON') & QGrams('NOSLEN')), sorted(['$N', 'N#']) - ) - self.assertEqual(sorted(QGrams('NAIL') & QGrams('LIAN')), []) - - self.assertEqual( - sorted( - QGrams('NELSON', start_stop='') - & QGrams('NEILSEN', start_stop='') - ), - sorted(['NE', 'LS']), - ) - self.assertEqual( - sorted( - QGrams('NELSON', start_stop='') - & QGrams('NOSLEN', start_stop='') - ), - [], - ) - self.assertEqual( - sorted( - QGrams('NAIL', start_stop='') & QGrams('LIAN', start_stop='') - ), - [], - ) - - def test_qgram_counts(self): - """Test abydos.tokenizer.QGrams counts.""" - self.assertEqual(QGrams('').count(), 0) - self.assertEqual(len(QGrams('')._ordered_list), 0) - - self.assertEqual(QGrams('NEILSEN').count(), 8) - self.assertEqual(QGrams('NELSON').count(), 7) - self.assertEqual(QGrams('NEILSEN', start_stop='').count(), 6) - self.assertEqual(QGrams('NELSON', start_stop='').count(), 5) - - self.assertEqual(len(QGrams('NEILSEN')._ordered_list), 8) - self.assertEqual(len(QGrams('NELSON')._ordered_list), 7) - self.assertEqual( - len(QGrams('NEILSEN', start_stop='')._ordered_list), 6 - ) - self.assertEqual(len(QGrams('NELSON', start_stop='')._ordered_list), 5) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/tokenizer/test_tokenizer_regexp.py b/tests/tokenizer/test_tokenizer_regexp.py new file mode 100644 index 000000000..f5be23abd --- /dev/null +++ b/tests/tokenizer/test_tokenizer_regexp.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import RegexpTokenizer + + +class RegexpTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.RegexpTokenizer.""" + + def test_regexp_tokenizer(self): + """Test abydos.tokenizer.RegexpTokenizer.""" + self.assertEqual(sorted(RegexpTokenizer().tokenize('').get_list()), []) + self.assertEqual( + sorted(RegexpTokenizer().tokenize('a').get_list()), ['a'] + ) + + self.assertEqual( + sorted(RegexpTokenizer().tokenize('NELSON').get_list()), + sorted(['NELSON']), + ) + self.assertEqual( + sorted(RegexpTokenizer().tokenize('NEILSEN').get_list()), + sorted(['NEILSEN']), + ) + + tweet = "Looking forward to hearing your ideas about what we can\ + accomplish this year & beyond. I'll answer your questions on\ + #AskPOTUS at 12:30p ET." + self.assertEqual( + sorted(RegexpTokenizer().tokenize(tweet).get_list()), + sorted( + [ + 'Looking', + 'forward', + 'to', + 'hearing', + 'your', + 'ideas', + 'about', + 'what', + 'we', + 'can', + 'accomplish', + 'this', + 'year', + 'beyond', + 'I', + 'll', + 'answer', + 'your', + 'questions', + 'on', + 'AskPOTUS', + 'at', + '12', + '30p', + 'ET', + ] + ), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_saps.py b/tests/tokenizer/test_tokenizer_saps.py new file mode 100644 index 000000000..2d473de62 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_saps.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import SAPSTokenizer + + +class SAPSTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.SAPSTokenizer.""" + + def test_saps_tokenizer(self): + """Test abydos.tokenizer.SAPSTokenizer.""" + self.assertEqual(sorted(SAPSTokenizer().tokenize('').get_list()), []) + self.assertEqual( + sorted(SAPSTokenizer().tokenize('a').get_list()), ['a'] + ) + + tok = SAPSTokenizer() + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), sorted(['nel', 'son']) + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), sorted(['neil', 'son']) + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['pe', 'nin', 'su', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['s', 'pec', 'ta', 'cu', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['suf', 'fi', 'cien', 't', 'ly']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['yac', 'h', 'tin', 'g']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['ca', 'ter', 'pil', 'lar', 's']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_sonoripy.py b/tests/tokenizer/test_tokenizer_sonoripy.py new file mode 100644 index 000000000..afbf9bfaf --- /dev/null +++ b/tests/tokenizer/test_tokenizer_sonoripy.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import SonoriPyTokenizer + +from six import PY2 + + +class SonoriPyTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.SonoriPyTokenizer.""" + + def test_sonoripy_tokenizer(self): + """Test abydos.tokenizer.SonoriPyTokenizer.""" + if PY2: # skip tests of SyllabiPy on Python 2.7 + return + + try: + from syllabipy.sonoripy import SonoriPy # noqa: F401 + except ImportError: # pragma: no cover + return + + self.assertEqual( + sorted(SonoriPyTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(SonoriPyTokenizer().tokenize('a').get_list()), ['a'] + ) + + tok = SonoriPyTokenizer() + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), sorted(['nel', 'son']) + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), sorted(['neil', 'son']) + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['pe', 'nin', 'su', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['spec', 'ta', 'cu', 'lar']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['suf', 'fi', 'cien', 'tly']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['yach', 'ting']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['ca', 'ter', 'pil', 'lars']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_vc_cluster.py b/tests/tokenizer/test_tokenizer_vc_cluster.py new file mode 100644 index 000000000..34526dc48 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_vc_cluster.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_vc_cluster. 
+ +This module contains unit tests for abydos.tokenizer.VCClusterTokenizer +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import VCClusterTokenizer + +from six import PY2 + + +class VCClusterTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.VCClusterTokenizer.""" + + def test_vc_cluster_tokenizer(self): + """Test abydos.tokenizer.VCClusterTokenizer.""" + self.assertEqual( + sorted(VCClusterTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(VCClusterTokenizer().tokenize('a').get_list()), ['a'] + ) + + tok = VCClusterTokenizer() + + self.assertEqual( + sorted(tok.tokenize('nelson').get_list()), + sorted(['n', 'els', 'on']), + ) + self.assertEqual( + sorted(tok.tokenize('neilson').get_list()), + sorted(['n', 'eils', 'on']), + ) + self.assertEqual( + sorted(tok.tokenize('peninsular').get_list()), + sorted(['p', 'en', 'ins', 'ul', 'ar']), + ) + self.assertEqual( + sorted(tok.tokenize('spectacular').get_list()), + sorted(['sp', 'ect', 'ac', 'ul', 'ar']), + ) + self.assertEqual( + sorted(tok.tokenize('sufficiently').get_list()), + sorted(['s', 'uff', 'ic', 'ientl', 'y']), + ) + self.assertEqual( + sorted(tok.tokenize('yachting').get_list()), + sorted(['yacht', 'ing']), + ) + self.assertEqual( + sorted(tok.tokenize('caterpillars').get_list()), + sorted(['c', 'at', 'erp', 'ill', 'ars']), + ) + if not PY2: + self.assertEqual( + sorted(tok.tokenize('Götterdämmerung').get_list()), + sorted(['G', 'ött', 'erd', 'ämm', 'er', 'ung']), + ) + + tok = VCClusterTokenizer(consonants='ptkbdgmn', vowels='aeiouwy') + self.assertEqual( + sorted(tok.tokenize('#winning #losing').get_list()), + sorted(['#', 'winn', 'ing', '#', 'losing']), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_whitespace.py b/tests/tokenizer/test_tokenizer_whitespace.py new file mode 100644 index 000000000..325ba7e34 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_whitespace.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. 
+ +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import WhitespaceTokenizer + + +class WhitespaceTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.WhitespaceTokenizer.""" + + def test_whitespace_tokenizer(self): + """Test abydos.tokenizer.WhitespaceTokenizer.""" + self.assertEqual( + sorted(WhitespaceTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(WhitespaceTokenizer().tokenize('a').get_list()), ['a'] + ) + + self.assertEqual( + sorted(WhitespaceTokenizer().tokenize('NELSON').get_list()), + sorted(['NELSON']), + ) + self.assertEqual( + sorted(WhitespaceTokenizer().tokenize('NEILSEN').get_list()), + sorted(['NEILSEN']), + ) + + tweet = 'Good to be home for a night. Even better to see the\ + @chicagobulls start the season off right! #SeeRed' + self.assertEqual( + sorted(WhitespaceTokenizer().tokenize(tweet).get_list()), + sorted( + [ + 'Good', + 'to', + 'be', + 'home', + 'for', + 'a', + 'night.', + 'Even', + 'better', + 'to', + 'see', + 'the', + '@chicagobulls', + 'start', + 'the', + 'season', + 'off', + 'right!', + '#SeeRed', + ] + ), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenizer/test_tokenizer_wordpunct.py b/tests/tokenizer/test_tokenizer_wordpunct.py new file mode 100644 index 000000000..c0ab86499 --- /dev/null +++ b/tests/tokenizer/test_tokenizer_wordpunct.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.tokenizer.test_tokenizer_qgrams. + +This module contains unit tests for abydos.tokenizer.QGrams +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.tokenizer import WordpunctTokenizer + + +class WordpunctTokenizerTestCases(unittest.TestCase): + """Test abydos.tokenizer.WordpunctTokenizer.""" + + def test_wordpunct_tokenizer(self): + """Test abydos.tokenizer.WordpunctTokenizer.""" + self.assertEqual( + sorted(WordpunctTokenizer().tokenize('').get_list()), [] + ) + self.assertEqual( + sorted(WordpunctTokenizer().tokenize('a').get_list()), ['a'] + ) + + self.assertEqual( + sorted(WordpunctTokenizer().tokenize('NELSON').get_list()), + sorted(['NELSON']), + ) + self.assertEqual( + sorted(WordpunctTokenizer().tokenize('NEILSEN').get_list()), + sorted(['NEILSEN']), + ) + + tweet = 'I got a chance to catch up with the @Space_Station crew\ + today. Nothing like a call to space on #AstronomyNight!' 
+ self.assertEqual( + sorted(WordpunctTokenizer().tokenize(tweet).get_list()), + sorted( + [ + 'I', + 'got', + 'a', + 'chance', + 'to', + 'catch', + 'up', + 'with', + 'the', + '@', + 'Space_Station', + 'crew', + 'today', + '.', + 'Nothing', + 'like', + 'a', + 'call', + 'to', + 'space', + 'on', + '#', + 'AstronomyNight', + '!', + ] + ), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/util/test_data.py b/tests/util/test_data.py new file mode 100644 index 000000000..f70a5cc16 --- /dev/null +++ b/tests/util/test_data.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.util.test_prod. + +This module contains unit tests for abydos.util._prod +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import shutil +import tempfile +import unittest + +from abydos.util._data import ( + download_package, + list_available_packages, + list_installed_packages, + package_path, +) + +from six import PY2 + + +class DataTestCases(unittest.TestCase): + """Test cases for abydos.util._prod.""" + + DEFAULT_URL = 'https://raw.githubusercontent.com/chrislit/' + DEFAULT_URL += 'abydos-data/master/index.xml' + + def test_data(self): + """Test abydos.util._data.""" + if PY2: # disable testing in Py2.7; the pickled data isn't supported + return + + self.assertTrue(isinstance(list_installed_packages(), list)) + self.assertTrue(isinstance(list_available_packages(), tuple)) + self.assertTrue( + isinstance(list_available_packages(url=self.DEFAULT_URL), tuple) + ) + + download_package('all') + self.assertEqual( + package_path('wikitext_qgram')[-14:], 'wikitext_qgram' + ) + with self.assertRaises(FileNotFoundError): + package_path('not_a_real_package') + + temppath = tempfile.mkdtemp() + download_package('wikitext_qgram', data_path=temppath, force=True) + download_package('wikitext_qgram', data_path=temppath) + shutil.rmtree(temppath) + + with self.assertRaises(ValueError): + list_available_packages(url='file:///etc/passwd') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/util/test_ncr.py b/tests/util/test_ncr.py new file mode 100644 index 000000000..b76f75b22 --- /dev/null +++ b/tests/util/test_ncr.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 by Christopher C. Little. +# This file is part of Abydos. +# +# Abydos is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Abydos is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Abydos. If not, see . + +"""abydos.tests.util.test_ncr. + +This module contains unit tests for abydos.util._ncr +""" + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import unittest + +from abydos.util._ncr import _ncr + + +class ProdTestCases(unittest.TestCase): + """Test cases for abydos.util._ncr.""" + + def test_ncr(self): + """Test abydos.util._ncr.""" + self.assertEqual(_ncr(1, 0), 1) + self.assertEqual(_ncr(5, 0), 1) + + self.assertEqual(_ncr(1, 2), 0) + self.assertEqual(_ncr(1, 2), 0) + + self.assertEqual(_ncr(2, 2), 1) + self.assertEqual(_ncr(10, 10), 1) + + self.assertEqual(_ncr(7, 2), 21) + self.assertEqual(_ncr(7, 3), 35) + self.assertEqual(_ncr(4, 3), 4) + self.assertEqual(_ncr(5, 3), 10) + self.assertEqual(_ncr(10, 2), 45) + self.assertEqual(_ncr(100, 3), 161700) + self.assertEqual(_ncr(80, 5), 24040016) + + # gamma variant + self.assertAlmostEqual(_ncr(10, 2.5), 77.8023559942) + self.assertAlmostEqual(_ncr(0, 2.5), 0.12732395447) + self.assertAlmostEqual(_ncr(2.5, 2.5), 1) + self.assertAlmostEqual(_ncr(2.5, 2.1), 1.7043970865) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/util/test_prod.py b/tests/util/test_prod.py index 76736148a..ec50dea5f 100644 --- a/tests/util/test_prod.py +++ b/tests/util/test_prod.py @@ -18,7 +18,7 @@ """abydos.tests.util.test_prod. -This module contains unit tests for abydos.util.prod +This module contains unit tests for abydos.util._prod """ from __future__ import ( diff --git a/tox.ini b/tox.ini index 96b510690..c3893148d 100644 --- a/tox.ini +++ b/tox.ini @@ -2,31 +2,38 @@ envlist = black py27 - py36 + py37 doctest py27-regression - py36-regression + py37-regression py27-fuzz - py36-fuzz + py37-fuzz pylint pydocstyle flake8 doc8 - badges docs + badges dist +# trick to enable pre-installation of Cython +indexserver = + preinstall = https://pypi.python.org/simple [testenv] deps = + :preinstall: cython nose coverage py27: pyliblzma + lzss + py36,py37,doctest: paq + nltk + syllabipy + scipy commands = nosetests [] [testenv:doctest] -basepython = python3.6 -deps = - nose +basepython = python3.7 setenv = NOSE_WITH_COVERAGE=0 NOSE_WITH_DOCTEST=1 @@ -36,7 +43,9 @@ passenv = commands = nosetests --where={toxinidir}/abydos [] [testenv:regression] -deps = nose +deps = + nose + deprecation commands = nosetests {toxinidir}/tests/regression --processes=-1 \ --process-timeout=1200 --process-restartworker -c=0 -v [] @@ -44,7 +53,7 @@ commands = nosetests {toxinidir}/tests/regression --processes=-1 \ deps = {[testenv:regression]deps} commands = {[testenv:regression]commands} -[testenv:py36-regression] +[testenv:py37-regression] deps = {[testenv:regression]deps} commands = {[testenv:regression]commands} @@ -57,32 +66,32 @@ commands = nosetests {toxinidir}/tests/fuzz --processes=-1 \ deps = {[testenv:fuzz]deps} commands = {[testenv:fuzz]commands} -[testenv:py36-fuzz] +[testenv:py37-fuzz] deps = {[testenv:fuzz]deps} commands = {[testenv:fuzz]commands} [testenv:pylint] -basepython = python3.6 +basepython = python3.7 skip_install = true deps = pylint commands = {toxinidir}/helpers/call_and_write_log.py \ "pylint --rcfile=setup.cfg abydos" 0 [testenv:pydocstyle] -basepython = python3.6 +basepython = python3.7 skip_install = true deps = pydocstyle commands = {toxinidir}/helpers/call_and_write_log.py "pydocstyle --count ." 
0 [testenv:pycodestyle] -basepython = python3.6 +basepython = python3.7 skip_install = true deps = pycodestyle commands = {toxinidir}/helpers/call_and_write_log.py "pycodestyle ." 0 [testenv:flake8] -basepython = python3.6 +basepython = python3.7 skip_install = true deps = flake8 @@ -113,14 +122,14 @@ commands = {toxinidir}/helpers/call_and_write_log.py \ "flake8 {toxinidir} --htmldir={toxinidir}/flake8" 0 [testenv:doc8] -basepython = python3.6 +basepython = python3.7 skip_install = true deps = doc8 commands = {toxinidir}/helpers/call_and_write_log.py "doc8 {toxinidir}" 0 [testenv:docs] changedir = docs -basepython = python3.6 +basepython = python3.7 whitelist_externals=make deps = sphinx @@ -128,30 +137,30 @@ deps = sphinx_rtd_theme numpy commands = - sphinx-apidoc -e -f -M -o . ../abydos - sphinx-build -b html -d {envtmpdir}/doctrees . _build/html - sphinx-build -b epub -d {envtmpdir}/doctrees . _build/epub - sphinx-build -b latex -d {envtmpdir}/doctrees \ + sphinx-apidoc -e -M -o . ../abydos + sphinx-build -j 8 -b html -d {envtmpdir}/doctrees . _build/html + sphinx-build -j 8 -b epub -d {envtmpdir}/doctrees . _build/epub + sphinx-build -j 8 -b latex -d {envtmpdir}/doctrees \ -D latex_elements.papersize=letter . _build/latex make PDFLATEX=xelatex -C _build/latex all-pdf make PDFLATEX=xelatex -C _build/latex all-pdf - sphinx-build -b coverage -d {envtmpdir}/doctrees . _build/coverage + sphinx-build -j 8 -b coverage -d {envtmpdir}/doctrees . _build/coverage [testenv:badges] -basepython = python3.6 +basepython = python3.7 skip_install = true commands = python {toxinidir}/badge_update.py [testenv:black] changedir = {toxinidir} -basepython = python3.6 +basepython = python3.7 skip_install = true deps = black commands = black . [testenv:dist] changedir = {toxinidir} -basepython = python3.6 +basepython = python3.7 skip_install = true commands = python setup.py sdist
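
The new tokenizer tests above all exercise the same class-based interface introduced in 0.4.0. The following sketch is not part of the patch; it only illustrates that interface using classes and accessors that appear verbatim in the tests, with arbitrary sample strings.

# Illustrative sketch of the tokenizer interface exercised by the tests
# above. Every class and accessor shown here appears in those tests; only
# the sample strings are arbitrary.
from abydos.tokenizer import QGrams, QSkipgrams

# tokenize() stores the tokens and returns the tokenizer, so accessors
# can be chained directly onto the call.
tokens = QGrams().tokenize('NELSON')   # default qval=2, '$'/'#' padding
print(tokens.get_list())               # e.g. ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
print(tokens.get_counter())            # Counter of q-gram multiplicities
print(tokens.count())                  # 7 for 'NELSON' with default settings

# Tokenized strings support Counter-style operations, as in the
# intersection tests: bigrams shared by NELSON and NEILSEN.
shared = QGrams().tokenize('NELSON') & QGrams().tokenize('NEILSEN')
print(sorted(shared))

# QSkipgrams adds skip-grams; with scaler='SSK' each skip-gram is weighted
# by its gap length, as in the QSkipgrams(qval=(2,), scaler='SSK') test.
weights = QSkipgrams(qval=(2,), scaler='SSK').tokenize('NIALL').get_counter()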
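
Likewise, tests/util/test_data.py above covers the new downloadable-data facility. The sketch below shows the workflow those tests step through; the function names and the 'wikitext_qgram' package name are taken directly from the test, but the functions are imported from the private abydos.util._data module, so treat this as illustrative rather than a documented public API.

# Workflow exercised by tests/util/test_data.py (illustrative; these
# functions live in the private abydos.util._data module and
# download_package() requires network access).
from abydos.util._data import (
    download_package,
    list_available_packages,
    list_installed_packages,
    package_path,
)

print(list_installed_packages())       # list of packages already on disk
print(list_available_packages())       # tuple describing the remote index

download_package('wikitext_qgram')     # fetch a single package by name
print(package_path('wikitext_qgram'))  # path to the installed package data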