Merge pull request #514 from jamesmartini/develop_quality512

Implemented Hertz-Stormo alignment quality metric and testing
cogent3 · Jan 31, 2020 · 70ba904 · 70ba904
2 parents 5d72736 + 1f03b8f
commit 70ba904
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 2 deletions.
diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py
@@ -64,6 +64,7 @@
 from cogent3.format.nexus import nexus_from_alignment
 from cogent3.format.phylip import alignment_to_phylip
 from cogent3.maths.stats.number import CategoryCounter
+from cogent3.maths.util import safe_log
 from cogent3.parse.gff import gff_parser
 from cogent3.util import progress_display as UI
 from cogent3.util.dict_array import DictArrayTemplate
@@ -2235,6 +2236,39 @@ class AlignmentI(object):
     default_gap = "-"  # default gap character for padding
     gap_chars = dict.fromkeys("-?")  # default gap chars for comparisons
 
+    def alignment_quality(self, equifreq_mprobs=True):
+        """
+        Computes the alignment quality for an alignment based on eq. (2) in noted reference.
+
+        Parameters
+        ----------
+        equifreq_mprobs : bool
+            If true, specifies equally frequent motif probabilities.
+
+        Notes
+        -----
+        G. Z. Hertz, G. D. Stormo - Published 1999, Bioinformatics, vol. 15 pg. 563-577.
+
+        The alignment quality statistic is a log-likelihood ratio (computed using log2)
+        of the observed alignment column freqs versus the expected.
+        """
+        counts = self.counts_per_pos()
+        if counts.array.max() == 0 or len(self.seqs) == 1:
+            return None
+        if equifreq_mprobs:
+            num_motifs = len(counts.motifs)
+            p = array([1 / num_motifs] * num_motifs)
+        else:
+            motif_probs = self.get_motif_probs()
+            p = array([motif_probs[b] for b in counts.motifs])
+        cols = p != 0
+        p = p[cols]
+        counts = counts.array[:, cols]
+        frequency = counts / self.num_seqs
+        log_f = safe_log(frequency / p)
+        I_seq = log_f * frequency
+        return I_seq.sum()
+
     def iter_positions(self, pos_order=None):
         """Iterates over positions in the alignment, in order.
 

diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py
@@ -10,10 +10,10 @@
 
 import numpy
 
-from numpy import arange, array, nan, transpose
+from numpy import arange, array, nan, transpose, log2
 from numpy.testing import assert_allclose
 
-from cogent3 import load_aligned_seqs, load_unaligned_seqs, make_seq
+from cogent3 import load_aligned_seqs, load_unaligned_seqs, make_seq, make_aligned_seqs
 from cogent3.core.alignment import (
     Aligned,
     Alignment,
@@ -1530,6 +1530,51 @@ class AlignmentBaseTests(SequenceCollectionBaseTests):
     as a constructor.
     """
 
+    def test_alignment_quality(self):
+        """Tests that the alignment_quality generates the right alignment quality
+        value based on the Hertz-Stormo metric. expected values are hand calculated
+        using the formula in the paper."""
+        aln = make_aligned_seqs(["AATTGA",
+                                 "AGGTCC",
+                                 "AGGATG",
+                                 "AGGCGT"], moltype="dna")
+        got = aln.alignment_quality(equifreq_mprobs=True)
+        expect = log2(4) + (3 / 2) * log2(3) + (1 / 2) * log2(2) + (1 / 2) * log2(2)
+        assert_allclose(got, expect)
+
+        aln = make_aligned_seqs(["AAAC",
+                                 "ACGC",
+                                 "AGCC",
+                                 "A-TC"], moltype="dna")
+        got = aln.alignment_quality(equifreq_mprobs=False)
+        expect = 2 * log2(1 / 0.4) + log2(1 / (4 * 0.4)) + (1 / 2) * log2(1 / (8 / 15)) + (
+                1 / 4) * log2(1 / (4 / 15))
+        assert_allclose(got, expect)
+
+        #1. Alignment just gaps (Gap chars need to be fixed for unspecified moltype, before uncommenting).
+        # aln = make_aligned_seqs(["----"])
+        # got = aln.alignment_quality(equifreq_mprobs=True)
+        # assert_allclose(got, 0)
+
+        #2 Just one sequence (I've made an assumption that if there is one sequence,
+        # the alignment quality should also return None, correct me if I'm wrong).
+        aln = make_aligned_seqs(["AAAC"])
+        got = aln.alignment_quality(equifreq_mprobs=True)
+        assert got is None
+
+        #3.1 Two seqs, one all gaps. (equifreq_mprobs=True)
+        aln = make_aligned_seqs(["----",
+                                "ACAT"])
+        got = aln.alignment_quality(equifreq_mprobs=True)
+        assert_allclose(got, 28)
+
+
+        #3.2 Two seqs, one all gaps. (equifreq_mprobs=False)
+        aln = make_aligned_seqs(["----",
+                                "AAAA"])
+        got = aln.alignment_quality(equifreq_mprobs=False)
+        assert_allclose(got, -2)
+
     def make_and_filter(self, raw, expected, motif_length, drop_remainder):
         # a simple filter func
         aln = self.Class(raw, info={"key": "value"})