Skip to content

Commit

Permalink
Merge pull request #514 from jamesmartini/develop_quality512
Browse files Browse the repository at this point in the history
Implemented Hertz-Stormo alignment quality metric and testing
  • Loading branch information
GavinHuttley committed Jan 31, 2020
2 parents 5d72736 + 1f03b8f commit 70ba904
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 2 deletions.
34 changes: 34 additions & 0 deletions src/cogent3/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from cogent3.format.nexus import nexus_from_alignment
from cogent3.format.phylip import alignment_to_phylip
from cogent3.maths.stats.number import CategoryCounter
from cogent3.maths.util import safe_log
from cogent3.parse.gff import gff_parser
from cogent3.util import progress_display as UI
from cogent3.util.dict_array import DictArrayTemplate
Expand Down Expand Up @@ -2235,6 +2236,39 @@ class AlignmentI(object):
default_gap = "-" # default gap character for padding
gap_chars = dict.fromkeys("-?") # default gap chars for comparisons

def alignment_quality(self, equifreq_mprobs=True):
"""
Computes the alignment quality for an alignment based on eq. (2) in noted reference.
Parameters
----------
equifreq_mprobs : bool
If true, specifies equally frequent motif probabilities.
Notes
-----
G. Z. Hertz, G. D. Stormo - Published 1999, Bioinformatics, vol. 15 pg. 563-577.
The alignment quality statistic is a log-likelihood ratio (computed using log2)
of the observed alignment column freqs versus the expected.
"""
counts = self.counts_per_pos()
if counts.array.max() == 0 or len(self.seqs) == 1:
return None
if equifreq_mprobs:
num_motifs = len(counts.motifs)
p = array([1 / num_motifs] * num_motifs)
else:
motif_probs = self.get_motif_probs()
p = array([motif_probs[b] for b in counts.motifs])
cols = p != 0
p = p[cols]
counts = counts.array[:, cols]
frequency = counts / self.num_seqs
log_f = safe_log(frequency / p)
I_seq = log_f * frequency
return I_seq.sum()

def iter_positions(self, pos_order=None):
"""Iterates over positions in the alignment, in order.
Expand Down
49 changes: 47 additions & 2 deletions tests/test_core/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@

import numpy

from numpy import arange, array, nan, transpose
from numpy import arange, array, nan, transpose, log2
from numpy.testing import assert_allclose

from cogent3 import load_aligned_seqs, load_unaligned_seqs, make_seq
from cogent3 import load_aligned_seqs, load_unaligned_seqs, make_seq, make_aligned_seqs
from cogent3.core.alignment import (
Aligned,
Alignment,
Expand Down Expand Up @@ -1530,6 +1530,51 @@ class AlignmentBaseTests(SequenceCollectionBaseTests):
as a constructor.
"""

def test_alignment_quality(self):
"""Tests that the alignment_quality generates the right alignment quality
value based on the Hertz-Stormo metric. expected values are hand calculated
using the formula in the paper."""
aln = make_aligned_seqs(["AATTGA",
"AGGTCC",
"AGGATG",
"AGGCGT"], moltype="dna")
got = aln.alignment_quality(equifreq_mprobs=True)
expect = log2(4) + (3 / 2) * log2(3) + (1 / 2) * log2(2) + (1 / 2) * log2(2)
assert_allclose(got, expect)

aln = make_aligned_seqs(["AAAC",
"ACGC",
"AGCC",
"A-TC"], moltype="dna")
got = aln.alignment_quality(equifreq_mprobs=False)
expect = 2 * log2(1 / 0.4) + log2(1 / (4 * 0.4)) + (1 / 2) * log2(1 / (8 / 15)) + (
1 / 4) * log2(1 / (4 / 15))
assert_allclose(got, expect)

#1. Alignment just gaps (Gap chars need to be fixed for unspecified moltype, before uncommenting).
# aln = make_aligned_seqs(["----"])
# got = aln.alignment_quality(equifreq_mprobs=True)
# assert_allclose(got, 0)

#2 Just one sequence (I've made an assumption that if there is one sequence,
# the alignment quality should also return None, correct me if I'm wrong).
aln = make_aligned_seqs(["AAAC"])
got = aln.alignment_quality(equifreq_mprobs=True)
assert got is None

#3.1 Two seqs, one all gaps. (equifreq_mprobs=True)
aln = make_aligned_seqs(["----",
"ACAT"])
got = aln.alignment_quality(equifreq_mprobs=True)
assert_allclose(got, 28)


#3.2 Two seqs, one all gaps. (equifreq_mprobs=False)
aln = make_aligned_seqs(["----",
"AAAA"])
got = aln.alignment_quality(equifreq_mprobs=False)
assert_allclose(got, -2)

def make_and_filter(self, raw, expected, motif_length, drop_remainder):
# a simple filter func
aln = self.Class(raw, info={"key": "value"})
Expand Down

0 comments on commit 70ba904

Please sign in to comment.