Skip to content

Commit

Permalink
Merge pull request #438 from jamesmartini/develop_testmethods
Browse files Browse the repository at this point in the history
Pull request for issue 'Changes affecting entropy_per_seq #433'.
  • Loading branch information
GavinHuttley committed Dec 6, 2019
2 parents 9e0ce5c + 8d3970c commit 187fcc8
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
51 changes: 51 additions & 0 deletions src/cogent3/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1871,6 +1871,57 @@ def set_repr_policy(self, num_seqs=None, num_pos=None):
assert isinstance(num_pos, int), "num_pos is not an integer"
self._repr_policy["num_pos"] = num_pos

def probs_per_seq(
    self,
    motif_length=1,
    include_ambiguity=False,
    allow_gap=False,
    exclude_unobserved=False,
    alert=False,
):
    """return MotifFreqsArray per sequence

    Parameters
    ----------
    motif_length
        number of characters per tuple.
    include_ambiguity
        if True, motifs containing ambiguous characters
        from the seq moltype are included. No expansion of those is attempted.
    allow_gap
        if True, motifs containing a gap character are included.
    exclude_unobserved
        if True, unobserved motif combinations are excluded.
    alert
        accepted, but not used by this method (it is not forwarded to
        counts_per_seq).

    Returns
    -------
    The result of ``to_freq_array()`` on the per-sequence counts, or None
    when ``counts_per_seq`` returns None.
    """
    # every option except ``alert`` is handed on to counts_per_seq by keyword
    count_options = dict(
        motif_length=motif_length,
        include_ambiguity=include_ambiguity,
        allow_gap=allow_gap,
        exclude_unobserved=exclude_unobserved,
    )
    seq_counts = self.counts_per_seq(**count_options)
    return None if seq_counts is None else seq_counts.to_freq_array()

def entropy_per_seq(
    self,
    motif_length=1,
    include_ambiguity=False,
    allow_gap=False,
    exclude_unobserved=True,
    alert=False,
):
    """returns the Shannon entropy per sequence

    Parameters
    ----------
    motif_length
        number of characters per tuple.
    include_ambiguity
        if True, motifs containing ambiguous characters
        from the seq moltype are included. No expansion of those is attempted.
    allow_gap
        if True, motifs containing a gap character are included.
    exclude_unobserved
        if True, unobserved motif combinations are excluded.
    alert
        passed through unchanged to probs_per_seq.

    Returns
    -------
    The result of ``entropy()`` on the per-sequence motif probabilities,
    or None when ``probs_per_seq`` returns None.

    Notes
    -----
    For motif_length > 1, it's advisable to keep exclude_unobserved=True
    (the default), this avoids unnecessary calculations.
    """
    seq_probs = self.probs_per_seq(
        motif_length=motif_length,
        include_ambiguity=include_ambiguity,
        allow_gap=allow_gap,
        exclude_unobserved=exclude_unobserved,
        alert=alert,
    )
    return None if seq_probs is None else seq_probs.entropy()


class SequenceCollection(_SequenceCollectionBase):
"""Container for unaligned sequences
Expand Down
14 changes: 14 additions & 0 deletions tests/test_core/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,12 @@ def test_set_repr_policy_valid_input(self):
seqs.set_repr_policy(num_seqs=5, num_pos=40)
self.assertEqual(seqs._repr_policy, dict(num_seqs=5, num_pos=40))

def test_get_seq_entropy(self):
    """entropy_per_seq should return the Shannon entropy of each seq"""
    # -sum(p * log_2(p)) for p = 0.25, 0.75 (seq "ACCC")
    expect_a = 0.81127812445913283
    # -sum(p * log_2(p)) for p = 0.5, 0.25, 0.25 (seq "AGTA")
    expect_b = 1.5
    seqs = self.Class({"a": "ACCC", "b": "AGTA"}, moltype=DNA)
    got = seqs.entropy_per_seq()
    self.assertFloatEqual(got, array([expect_a, expect_b]))

class SequenceCollectionTests(SequenceCollectionBaseTests, TestCase):
"""Tests of the SequenceCollection object. Includes ragged collection tests.
Expand Down Expand Up @@ -2358,6 +2364,14 @@ def test_info_source(self):
seqs = load_aligned_seqs("data/brca1.fasta", array_align=array_align)
self.assertEqual(seqs.info.source, "data/brca1.fasta")

def test_seq_entropy_just_gaps(self):
    """entropy_per_seq should handle sequences made up only of gaps"""
    # one seq with a single observed state, the other entirely gaps
    some_gaps = self.Class({"a": "A---", "b": "----"}, moltype=DNA)
    # assert_allclose compares NaN as equal to NaN by default
    assert_allclose(some_gaps.entropy_per_seq(), [0, numpy.nan])

    # every seq is entirely gaps, so None is expected instead of an array
    only_gaps = self.Class({"a": "----", "b": "----"}, moltype=DNA)
    self.assertIs(only_gaps.entropy_per_seq(), None)

class ArrayAlignmentTests(AlignmentBaseTests, TestCase):
Class = ArrayAlignment
Expand Down

0 comments on commit 187fcc8

Please sign in to comment.