Skip to content

Commit

Permalink
Merge pull request #458 from jamesmartini/develop_regex
Browse files Browse the repository at this point in the history
Created annotate_matches_to, and associated test methods
  • Loading branch information
GavinHuttley committed Jan 2, 2020
2 parents de5cbac + af4d0a4 commit 42b65ee
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 22 deletions.
1 change: 1 addition & 0 deletions src/cogent3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
SequenceCollection,
)
from cogent3.core.genetic_code import available_codes, get_code

# note that moltype has to be imported last, because it sets the moltype in
# the objects created by the other modules.
from cogent3.core.moltype import (
Expand Down
55 changes: 33 additions & 22 deletions src/cogent3/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1917,25 +1917,25 @@ def entropy_per_seq(
exclude_unobserved=True,
alert=False,
):
"""returns the Shannon entropy per sequence
"""Returns the Shannon entropy per sequence.
Parameters
----------
motif_length
number of characters per tuple.
include_ambiguity
if True, motifs containing ambiguous characters
from the seq moltype are included. No expansion of those is attempted.
allow_gap
if True, motifs containing a gap character are included.
exclude_unobserved
if True, unobserved motif combinations are excluded.
Parameters
----------
motif_length: int
number of characters per tuple.
include_ambiguity: bool
if True, motifs containing ambiguous characters
from the seq moltype are included. No expansion of those is attempted.
allow_gap: bool
if True, motifs containing a gap character are included.
exclude_unobserved: bool
if True, unobserved motif combinations are excluded.
Notes
-----
For motif_length > 1, it's advisable to specify exclude_unobserved=True,
this avoids unnecessary calculations.
"""
Notes
-----
For motif_length > 1, it's advisable to specify exclude_unobserved=True,
this avoids unnecessary calculations.
"""
probs = self.probs_per_seq(
motif_length=motif_length,
include_ambiguity=include_ambiguity,
Expand Down Expand Up @@ -2013,6 +2013,14 @@ def __init__(self, map, data, length=None):
if hasattr(data, "name"):
self.name = data.name

def annotate_matches_to(self, pattern, annot_type, name, allow_multiple=False):
    """Annotate matches to pattern by delegating to the wrapped data.

    All arguments are forwarded unchanged, as keyword arguments, to
    ``self.data.annotate_matches_to`` and its result is returned as is.

    Parameters
    ----------
    pattern : string
        The search string for which annotations are made.
    annot_type : string
        The type of the annotation (e.g. exon).
    name : string
        The name of the annotation.
    allow_multiple : boolean
        Forwarded to the underlying method; defaults to False.

    Returns
    -------
    Whatever ``self.data.annotate_matches_to`` returns.
    """
    delegate = self.data.annotate_matches_to
    forwarded = dict(
        pattern=pattern,
        annot_type=annot_type,
        name=name,
        allow_multiple=allow_multiple,
    )
    return delegate(**forwarded)

def _get_moltype(self):
    """Return the ``moltype`` attribute of the wrapped ``data`` object."""
    wrapped = self.data
    return wrapped.moltype

Expand Down Expand Up @@ -2333,6 +2341,7 @@ def probs_per_seq(
exclude_unobserved=False,
alert=False,
):

"""return MotifFreqsArray per sequence
Parameters
Expand All @@ -2347,6 +2356,7 @@ def probs_per_seq(
exclude_unobserved
if True, unobserved motif combinations are excluded.
"""

counts = self.counts_per_seq(
motif_length=motif_length,
include_ambiguity=include_ambiguity,
Expand Down Expand Up @@ -2380,11 +2390,12 @@ def entropy_per_seq(
exclude_unobserved
if True, unobserved motif combinations are excluded.
Notes
-----
For motif_length > 1, it's advisable to specify exclude_unobserved=True,
this avoids unnecessary calculations.
"""
Notes
-----
For motif_length > 1, it's advisable to specify exclude_unobserved=True,
this avoids unnecessary calculations.
"""

probs = self.probs_per_seq(
motif_length=motif_length,
include_ambiguity=include_ambiguity,
Expand Down
36 changes: 36 additions & 0 deletions src/cogent3/core/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,42 @@ def __str__(self):
"""__str__ returns self._seq unmodified."""
return self._seq

def annotate_matches_to(self, pattern, annot_type, name, allow_multiple=False):
    """Adds an annotation at matches of the specified pattern in a sequence.

    The pattern is converted to a regular expression by the moltype's
    ``to_regex`` method before searching, which is what allows IUPAC
    ambiguity characters to be used in the pattern.

    Parameters
    ----------
    pattern : string
        The search string for which annotations are made.
    annot_type : string
        The type of the annotation (e.g. exon).
    name : string
        The name of the annotation. When allow_multiple is True, each
        annotation is named ``"<name>:<i>"`` where i is the 0-based index
        of the match.
    allow_multiple : boolean
        If True, every non-overlapping occurrence of the pattern is
        annotated. If False, only the first occurrence is annotated.

    Returns
    -------
    A list of the objects returned by ``add_feature`` (Annotation
    instances), one per annotated match. The list is empty when the
    pattern does not occur in the sequence.
    """
    regex = self.moltype.to_regex(seq=pattern)

    if allow_multiple:
        # one feature per match, the match index disambiguates the names
        return [
            self.add_feature(annot_type, f"{name}:{index}", [match.span()])
            for index, match in enumerate(re.finditer(regex, self._seq))
        ]

    # only the first match is wanted, so stop scanning as soon as it's found
    first = re.search(regex, self._seq)
    if first is None:
        return []
    return [self.add_feature(annot_type, name, [first.span()])]

def to_fasta(self, make_seqlabel=None, block_size=60):
"""Return string of self in FASTA format, no trailing newline
Expand Down
28 changes: 28 additions & 0 deletions tests/test_core/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2571,6 +2571,34 @@ def test_add_from_ref_aln(self):
ValueError, aln1.add_from_ref_aln, aln2_wrong_refseq
) # test wrong_refseq

def test_annotate_matches_to(self):
    """Aligned.annotate_matches_to correctly delegates to sequence"""
    aln = Alignment(dict(x="TTCCACTTCCGCTT"), moltype="dna")
    seq = aln.named_seqs["x"]
    pattern = "CCRC"
    regular_expression = DNA.to_regex(seq=pattern)

    annot = seq.annotate_matches_to(
        pattern=pattern,
        annot_type="domain",
        name="fred",
        allow_multiple=True,
    )
    # the sequence holds two matches (CCAC and CCGC); asserting the count
    # prevents the loop below from passing vacuously on an empty result
    self.assertEqual(len(annot), 2)
    for feature in annot:
        fred = feature.get_slice()
        self.assertEqual(
            str(fred), re.search(regular_expression, str(fred)).group()
        )

    annot = seq.annotate_matches_to(
        pattern=pattern,
        annot_type="domain",
        name="fred",
        allow_multiple=False,
    )
    # without allow_multiple only the first match is annotated
    self.assertEqual(len(annot), 1)
    fred = annot[0].get_slice()
    self.assertEqual(
        str(fred), re.search(regular_expression, str(fred)).group()
    )

def test_deepcopy(self):
"""correctly deep copy aligned objects in an alignment"""
path = "data/brca1_5.paml"
Expand Down
36 changes: 36 additions & 0 deletions tests/test_core/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,42 @@ class SequenceTests(TestCase):
DNA = DnaSequence
PROT = ProteinSequence

def test_annotate_matches_to(self):
    """
    annotate_matches_to method should attach
    annotations correctly to a Sequence object, tested
    for both multiple and singular annotations.
    For Sequence objects of MolType
    ASCII, annotate_matches_to should return an empty annotation.
    """
    seq = self.DNA("TTCCACTTCCGCTT", name="x")
    pattern = "CCRC"

    # every occurrence of the pattern is annotated
    annot = seq.annotate_matches_to(
        pattern=pattern,
        annot_type="domain",
        name="fred",
        allow_multiple=True,
    )
    self.assertEqual([a.get_slice() for a in annot], ["CCAC", "CCGC"])

    # only the first occurrence is annotated
    annot = seq.annotate_matches_to(
        pattern=pattern,
        annot_type="domain",
        name="fred",
        allow_multiple=False,
    )
    # check the count before indexing so a failure is an assertion error,
    # not an IndexError
    self.assertEqual(len(annot), 1)
    # compare the whole slice; truncating it to len(pattern) would hide a
    # feature that spans more than the match
    fred = annot[0].get_slice()
    self.assertEqual(str(fred), "CCAC")

    # the pattern finds no match in an ASCII moltype sequence
    seq = ASCII.make_seq(seq="TTCCACTTCCGCTT")
    annot = seq.annotate_matches_to(
        pattern=pattern,
        annot_type="domain",
        name="fred",
        allow_multiple=False,
    )
    self.assertEqual(annot, [])


def test_init_empty(self):
"""Sequence and subclasses should init correctly."""
# NOTE: ModelSequences can't be initialized empty because it screws up
Expand Down

0 comments on commit 42b65ee

Please sign in to comment.