Skip to content

Commit

Permalink
Merge pull request #536 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
Maintenance
  • Loading branch information
GavinHuttley committed Feb 16, 2020
2 parents 7bd5466 + 96ab670 commit 921c34e
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 48 deletions.
13 changes: 1 addition & 12 deletions src/cogent3/core/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

from numpy import array, digitize
from numpy.random import random
from numpy.testing import assert_allclose

from cogent3.maths.util import safe_log, safe_p_log_p
from cogent3.maths.util import safe_log, safe_p_log_p, validate_freqs_array
from cogent3.util.dict_array import DictArray, DictArrayTemplate
from cogent3.util.misc import extend_docstring_from

Expand All @@ -19,16 +18,6 @@
__status__ = "Production"


def validate_freqs_array(data, axis=None):
if (data < 0).any():
raise ValueError("negative frequency not allowed")

# we explicitly ignore nan
result = data.sum(axis=axis)
if not numpy.allclose(result[numpy.isnan(result) == False], 1):
raise ValueError("invalid frequencies, sum(axis=1) is not equal to 1")


class _MotifNumberArray(DictArray):
def __init__(self, data, motifs, row_indices=None, dtype=None):
"""
Expand Down
17 changes: 9 additions & 8 deletions src/cogent3/maths/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import cogent3.util.misc

from cogent3.maths.util import safe_p_log_p
from cogent3.maths.util import safe_p_log_p, validate_freqs_array

from .util import safe_log

Expand Down Expand Up @@ -88,18 +88,16 @@ def paralinear_continuous_time(P, pi, Q, validate=False):


def jsd(freqs1, freqs2, validate=False):
"""
"""calculate Jensen–Shannon divergence between two probability distributions
Parameters
----------
freqs1 : one dimensional array
row vector frequencies, sum to 1
freqs2 : one dimensional array
row vector frequencies, sum to 1
validate : bool
Returns
-------
the mathematical calculation of Jensen–Shannon divergence
between two probability distributions
"""
# Convert input arrays into numpy arrays
freqs1 = array(freqs1)
Expand All @@ -111,8 +109,11 @@ def jsd(freqs1, freqs2, validate=False):
)
assert freqs1.ndim == 1, "freqs1 has incorrect dimension"
assert freqs2.ndim == 1, "freqs2 has incorrect dimension"
assert_allclose(sum(freqs1), 1, err_msg="invalid freqs1")
assert_allclose(sum(freqs2), 1, err_msg="invalid freqs2")
try:
validate_freqs_array(freqs1)
validate_freqs_array(freqs2)
except ValueError as err:
raise AssertionError("freqs not valid") from err

H_mn = safe_p_log_p(freqs1 / 2 + freqs2 / 2).sum()
mn_H = sum([sum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
Expand Down
22 changes: 22 additions & 0 deletions src/cogent3/maths/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,25 @@ def column_degeneracy(a, cutoff=0.5):
# degen contains now the indices at which the cutoff was hit
# to change to the number of characters, add 1
return clip(array(degen) + 1, 0, a.shape[0])


def validate_freqs_array(data, axis=None):
"""input data is a valid frequency array
Parameters
----------
data : ndarray
numpy array
axis : int or None
indicates along which axis the array should sum to 1
Notes
-----
Raises ValueError if any element < 0 or series do not sum to 1
"""
if (data < 0).any():
raise ValueError("negative frequency not allowed")

# we explicitly ignore nan
result = data.sum(axis=axis)
if not numpy.allclose(result[numpy.isnan(result) == False], 1):
raise ValueError("invalid frequencies, sum(axis=1) is not equal to 1")
56 changes: 28 additions & 28 deletions tests/test_maths/test_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,37 +166,37 @@ def test_jsd_validation(self):
def test_jsd(self):
"""case1 is testing if the jsd between two identical distributions is 0.0"""
case1 = [
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
]
for pointer in range(10):
case1[0][pointer] = 1.0
case1[1][pointer] = 1.0
for index in range(len(case1[0])):
case1[0][index] = 1.0
case1[1][index] = 1.0
assert_allclose(
jsd(case1[0], case1[1], validate=True),
0.0,
err_msg="Testing case1 for jsd failed",
)
case1[0][pointer] = 0.0
case1[1][pointer] = 0.0
"""case2 is testing the numerical output of jsd between two random distributions"""
case2 = [[1.0 / 10, 9.0 / 10, 0], [0, 1.0 / 10, 9.0 / 10]]
case1[0][index] = 0.0
case1[1][index] = 0.0
# case2 is testing the numerical output of jsd between two distant distributions
case2 = [[1 / 10, 9 / 10, 0], [0, 1 / 10, 9 / 10]]
assert_allclose(
jsd(case2[0], case2[1], validate=True),
0.7655022032053593,
err_msg="Testing case2 for jsd failed",
)
"""case3 is testing the numerical output of jsd between two random distributions"""
case3 = [[1.0, 0.0], [0.5, 0.5]]
# case3 is testing the numerical output of jsd between two distant distributions
case3 = [[1.0, 0.0], [1 / 2, 1 / 2]]
assert_allclose(
jsd(case3[0], case3[1], validate=True),
0.3112781244591328,
err_msg="Testing case3 for jsd failed",
)
"""case4 is testing if the jsd between two identical uniform distributions is 0.0"""
# case4 - the jsd between two identical uniform distributions is 0.0
case4 = [
[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
[1 / 10] * 10,
[1 / 10] * 10,
]
assert_allclose(
jsd(case4[0], case4[1], validate=True),
Expand All @@ -207,37 +207,37 @@ def test_jsd(self):
def test_jsm(self):
"""case1 is testing if the jsm between two identical distributions is 0.0"""
case1 = [
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
]
for pointer in range(10):
case1[0][pointer] = 1.0
case1[1][pointer] = 1.0
for index in range(len(case1[0])):
case1[0][index] = 1.0
case1[1][index] = 1.0
assert_allclose(
jsm(case1[0], case1[1], validate=True),
0.0,
err_msg="Testing case1 for jsm failed",
)
case1[0][pointer] = 0.0
case1[1][pointer] = 0.0
"""case2 is testing the numerical output of jsm between two random distributions"""
case2 = [[1.0 / 10, 9.0 / 10, 0], [0, 1.0 / 10, 9.0 / 10]]
case1[0][index] = 0.0
case1[1][index] = 0.0
# case2 is testing the numerical output of jsm between two random distributions
case2 = [[1 / 10, 9 / 10, 0], [0, 1 / 10, 9 / 10]]
assert_allclose(
jsm(case2[0], case2[1], validate=True),
0.8749298275892526,
err_msg="Testing case2 for jsm failed",
)
"""case3 is testing the numerical output of jsm between two random distributions"""
case3 = [[1.0, 0.0], [0.5, 0.5]]
# case3 is testing the numerical output of jsm between two random distributions
case3 = [[1.0, 0.0], [1 / 2, 1 / 2]]
assert_allclose(
jsm(case3[0], case3[1], validate=True),
0.5579230452841438,
err_msg="Testing case3 for jsm failed",
)
"""case4 is testing if the jsm between two identical uniform distributions is 0.0"""
# case4 is testing if the jsm between two identical uniform distributions is 0.0
case4 = [
[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
[1 / 10] * 10,
[1 / 10] * 10,
]
assert_allclose(
jsm(case4[0], case4[1], validate=True),
Expand Down

0 comments on commit 921c34e

Please sign in to comment.