Skip to content

Commit

Permalink
corrected bz2 algorithm & added to doctests
Browse files — browse the repository at this point in the history
  • Loading branch information
chrislit committed Oct 22, 2018
1 parent 474bc8a commit 5fcdb58
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Abydos
:target: https://app.fossa.io/projects/git%2Bgithub.com%2Fchrislit%2Fabydos?ref=badge_shield
:alt: FOSSA Status

.. |pylint| image:: https://img.shields.io/badge/Pylint-9.45/10-yellowgreen.svg
.. |pylint| image:: https://img.shields.io/badge/Pylint-9.44/10-yellowgreen.svg
:target: #
:alt: Pylint Score

Expand Down
76 changes: 68 additions & 8 deletions abydos/distance/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,14 @@ def dist_ncd_arith(src, tar, probs=None):
:returns: compression distance
:rtype: float
>>> dist_ncd_arith('cat', 'hat')
0.5454545454545454
>>> dist_ncd_arith('Niall', 'Neil')
0.6875
>>> dist_ncd_arith('aluminum', 'Catalan')
0.8275862068965517
>>> dist_ncd_arith('ATCG', 'TAGC')
0.6923076923076923
"""
if src == tar:
return 0.0
Expand Down Expand Up @@ -83,8 +89,14 @@ def sim_ncd_arith(src, tar, probs=None):
:returns: compression similarity
:rtype: float
>>> sim_ncd_arith('cat', 'hat')
0.4545454545454546
>>> sim_ncd_arith('Niall', 'Neil')
0.3125
>>> sim_ncd_arith('aluminum', 'Catalan')
0.1724137931034483
>>> sim_ncd_arith('ATCG', 'TAGC')
0.3076923076923077
"""
return 1-dist_ncd_arith(src, tar, probs)

Expand All @@ -101,8 +113,14 @@ def dist_ncd_rle(src, tar, use_bwt=False):
:returns: compression distance
:rtype: float
>>> dist_ncd_rle('cat', 'hat')
1.0
>>> dist_ncd_rle('Niall', 'Neil')
1.0
>>> dist_ncd_rle('aluminum', 'Catalan')
1.0
>>> dist_ncd_rle('ATCG', 'TAGC')
1.0
"""
if src == tar:
return 0.0
Expand All @@ -129,8 +147,14 @@ def sim_ncd_rle(src, tar, use_bwt=False):
:returns: compression similarity
:rtype: float
>>> sim_ncd_rle('cat', 'hat')
0.0
>>> sim_ncd_rle('Niall', 'Neil')
0.0
>>> sim_ncd_rle('aluminum', 'Catalan')
0.0
>>> sim_ncd_rle('ATCG', 'TAGC')
0.0
"""
return 1 - dist_ncd_rle(src, tar, use_bwt)

Expand All @@ -145,8 +169,14 @@ def dist_ncd_bwtrle(src, tar):
:returns: compression distance
:rtype: float
>>> dist_ncd_bwtrle('cat', 'hat')
0.75
>>> dist_ncd_bwtrle('Niall', 'Neil')
0.8333333333333334
>>> dist_ncd_bwtrle('aluminum', 'Catalan')
1.0
>>> dist_ncd_bwtrle('ATCG', 'TAGC')
0.8
"""
return dist_ncd_rle(src, tar, True)

Expand All @@ -161,8 +191,14 @@ def sim_ncd_bwtrle(src, tar):
:returns: compression similarity
:rtype: float
>>> sim_ncd_bwtrle('cat', 'hat')
0.25
>>> sim_ncd_bwtrle('Niall', 'Neil')
0.16666666666666663
>>> sim_ncd_bwtrle('aluminum', 'Catalan')
0.0
>>> sim_ncd_bwtrle('ATCG', 'TAGC')
0.19999999999999996
"""
return 1 - dist_ncd_bwtrle(src, tar)

Expand All @@ -177,8 +213,14 @@ def dist_ncd_zlib(src, tar):
:returns: compression distance
:rtype: float
>>> dist_ncd_zlib('cat', 'hat')
0.3333333333333333
>>> dist_ncd_zlib('Niall', 'Neil')
0.45454545454545453
>>> dist_ncd_zlib('aluminum', 'Catalan')
0.5714285714285714
>>> dist_ncd_zlib('ATCG', 'TAGC')
0.4
"""
if src == tar:
return 0.0
Expand Down Expand Up @@ -206,8 +248,14 @@ def sim_ncd_zlib(src, tar):
:returns: compression similarity
:rtype: float
>>> sim_ncd_zlib('cat', 'hat')
0.6666666666666667
>>> sim_ncd_zlib('Niall', 'Neil')
0.5454545454545454
>>> sim_ncd_zlib('aluminum', 'Catalan')
0.4285714285714286
>>> sim_ncd_zlib('ATCG', 'TAGC')
0.6
"""
return 1 - dist_ncd_zlib(src, tar)

Expand All @@ -224,23 +272,23 @@ def dist_ncd_bz2(src, tar):
>>> dist_ncd_bz2('cat', 'hat')
0.08
>>> dist_ncd_bz2('Niall', 'Neil')
0.037037037037037035
>>> dist_ncd_bz2('aluminum', 'Catalan')
0.20689655172413793
>>> dist_ncd_bz2('ATCG', 'TAGC')
0.037037037037037035
>>> dist_ncd_bz2('Niall', 'Neil')
0.037037037037037035
"""
if src == tar:
return 0.0

src = src.encode('utf-8')
tar = tar.encode('utf-8')

src_comp = encode(src, 'bz2_codec')[2:]
tar_comp = encode(tar, 'bz2_codec')[2:]
concat_comp = encode(src + tar, 'bz2_codec')[2:]
concat_comp2 = encode(tar + src, 'bz2_codec')[2:]
src_comp = encode(src, 'bz2_codec')[15:]
tar_comp = encode(tar, 'bz2_codec')[15:]
concat_comp = encode(src + tar, 'bz2_codec')[15:]
concat_comp2 = encode(tar + src, 'bz2_codec')[15:]

return ((min(len(concat_comp), len(concat_comp2)) -
min(len(src_comp), len(tar_comp))) /
Expand All @@ -259,12 +307,12 @@ def sim_ncd_bz2(src, tar):
>>> sim_ncd_bz2('cat', 'hat')
0.92
>>> sim_ncd_bz2('Niall', 'Neil')
0.962962962962963
>>> sim_ncd_bz2('aluminum', 'Catalan')
0.7931034482758621
>>> sim_ncd_bz2('ATCG', 'TAGC')
0.962962962962963
>>> sim_ncd_bz2('Niall', 'Neil')
0.962962962962963
"""
return 1 - dist_ncd_bz2(src, tar)

Expand All @@ -279,8 +327,14 @@ def dist_ncd_lzma(src, tar):
:returns: compression distance
:rtype: float
>>> dist_ncd_lzma('cat', 'hat')
0.08695652173913043
>>> dist_ncd_lzma('Niall', 'Neil')
0.16
>>> dist_ncd_lzma('aluminum', 'Catalan')
0.16
>>> dist_ncd_lzma('ATCG', 'TAGC')
0.08695652173913043
"""
if src == tar:
return 0.0
Expand Down Expand Up @@ -312,8 +366,14 @@ def sim_ncd_lzma(src, tar):
:returns: compression similarity
:rtype: float
>>> sim_ncd_lzma('cat', 'hat')
0.9130434782608696
>>> sim_ncd_lzma('Niall', 'Neil')
0.84
>>> sim_ncd_lzma('aluminum', 'Catalan')
0.84
>>> sim_ncd_lzma('ATCG', 'TAGC')
0.9130434782608696
"""
return 1 - dist_ncd_lzma(src, tar)

Expand Down

0 comments on commit 5fcdb58

Please sign in to comment.