Merge pull request #458 from jamesmartini/develop_regex

Created annotate_matches_to, and associated test methods
cogent3 · Jan 2, 2020 · 42b65ee · 42b65ee
2 parents de5cbac + af4d0a4
commit 42b65ee
Show file tree

Hide file tree

Showing 5 changed files with 134 additions and 22 deletions.
diff --git a/src/cogent3/__init__.py b/src/cogent3/__init__.py
@@ -18,6 +18,7 @@
     SequenceCollection,
 )
 from cogent3.core.genetic_code import available_codes, get_code
+
 # note that moltype has to be imported last, because it sets the moltype in
 # the objects created by the other modules.
 from cogent3.core.moltype import (

diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py
@@ -1917,25 +1917,25 @@ def entropy_per_seq(
         exclude_unobserved=True,
         alert=False,
     ):
-        """returns the Shannon entropy per sequence
+        """Returns the Shannon entropy per sequence.
 
-                Parameters
-                ----------
-                motif_length
-                    number of characters per tuple.
-                include_ambiguity
-                    if True, motifs containing ambiguous characters
-                    from the seq moltype are included. No expansion of those is attempted.
-                allow_gap
-                    if True, motifs containing a gap character are included.
-                exclude_unobserved
-                    if True, unobserved motif combinations are excluded.
+        Parameters
+        ----------
+        motif_length: int
+            number of characters per tuple.
+        include_ambiguity: bool
+            if True, motifs containing ambiguous characters
+            from the seq moltype are included. No expansion of those is attempted.
+        allow_gap: bool
+            if True, motifs containing a gap character are included.
+        exclude_unobserved: bool
+            if True, unobserved motif combinations are excluded.
 
-                Notes
-                -----
-                For motif_length > 1, it's advisable to specify exclude_unobserved=True,
-                this avoids unnecessary calculations.
-                """
+        Notes
+        -----
+        For motif_length > 1, it's advisable to specify exclude_unobserved=True,
+        this avoids unnecessary calculations.
+        """
         probs = self.probs_per_seq(
             motif_length=motif_length,
             include_ambiguity=include_ambiguity,
@@ -2013,6 +2013,14 @@ def __init__(self, map, data, length=None):
         if hasattr(data, "name"):
             self.name = data.name
 
+    def annotate_matches_to(self, pattern, annot_type, name, allow_multiple=False):
+        return self.data.annotate_matches_to(
+            pattern=pattern,
+            annot_type=annot_type,
+            name=name,
+            allow_multiple=allow_multiple,
+        )
+
     def _get_moltype(self):
         return self.data.moltype
 
@@ -2333,6 +2341,7 @@ def probs_per_seq(
         exclude_unobserved=False,
         alert=False,
     ):
+
         """return MotifFreqsArray per sequence
 
         Parameters
@@ -2347,6 +2356,7 @@ def probs_per_seq(
         exclude_unobserved
             if True, unobserved motif combinations are excluded.
         """
+
         counts = self.counts_per_seq(
             motif_length=motif_length,
             include_ambiguity=include_ambiguity,
@@ -2380,11 +2390,12 @@ def entropy_per_seq(
         exclude_unobserved
             if True, unobserved motif combinations are excluded.
 
-        Notes
-        -----
-        For motif_length > 1, it's advisable to specify exclude_unobserved=True,
-        this avoids unnecessary calculations.
-        """
+                Notes
+                -----
+                For motif_length > 1, it's advisable to specify exclude_unobserved=True,
+                this avoids unnecessary calculations.
+                """
+
         probs = self.probs_per_seq(
             motif_length=motif_length,
             include_ambiguity=include_ambiguity,

diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py
@@ -98,6 +98,42 @@ def __str__(self):
         """__str__ returns self._seq unmodified."""
         return self._seq
 
+    def annotate_matches_to(self, pattern, annot_type, name, allow_multiple=False):
+        """
+        Adds an annotation where the sequence matches the specified pattern.
+        The pattern may contain IUPAC ambiguity codes,
+        which are converted to a regular expression before searching.
+
+        Parameters
+        ----------
+        pattern : string
+            The search string for which annotations are made.
+        annot_type : string
+            The type of the annotation (e.g. exon).
+        name : string
+            The name of the annotation.
+        allow_multiple : boolean
+            If True, annotates every occurrence of the pattern, else only the first.
+
+        Returns
+        -------
+        A list of Annotation instances; empty if the pattern has no match.
+        """
+        pattern = self.moltype.to_regex(seq=pattern)
+        pos = [m.span() for m in re.finditer(pattern, self._seq)]
+        if not pos:
+            return []
+        annot = []
+        if allow_multiple:
+            for i in range(0, len(pos)):
+                annot.append(
+                    self.add_feature(annot_type, f"{name}:{i}", [pos[i]])
+                )
+        else:
+            pos = pos[:1]
+            annot.append(self.add_feature(annot_type, name, pos))
+        return annot
+
     def to_fasta(self, make_seqlabel=None, block_size=60):
         """Return string of self in FASTA format, no trailing newline
 

diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py
@@ -2571,6 +2571,34 @@ def test_add_from_ref_aln(self):
             ValueError, aln1.add_from_ref_aln, aln2_wrong_refseq
         )  # test wrong_refseq
 
+    def test_annotate_matches_to(self):
+        """Aligned.annotate_matches_to correctly delegates to sequence"""
+        aln = Alignment(dict(x="TTCCACTTCCGCTT"), moltype="dna")
+        seq = aln.named_seqs["x"]
+        pattern = "CCRC"
+        annot = seq.annotate_matches_to(
+            pattern=pattern,
+            annot_type="domain",
+            name="fred",
+            allow_multiple=True,
+        )
+        regular_expression = DNA.to_regex(seq=pattern)
+        for i in range(0, len(annot)):
+            fred = annot[i].get_slice()
+            self.assertEqual(
+                str(fred), re.search(regular_expression, str(fred)).group()
+            )
+        annot = seq.annotate_matches_to(
+            pattern=pattern,
+            annot_type="domain",
+            name="fred",
+            allow_multiple=False,
+        )
+        fred = annot[0].get_slice()
+        self.assertEqual(
+            str(fred), re.search(regular_expression, str(fred)).group()
+        )
+
     def test_deepcopy(self):
         """correctly deep copy aligned objects in an alignment"""
         path = "data/brca1_5.paml"

diff --git a/tests/test_core/test_sequence.py b/tests/test_core/test_sequence.py
@@ -50,6 +50,42 @@ class SequenceTests(TestCase):
     DNA = DnaSequence
     PROT = ProteinSequence
 
+    def test_annotate_matches_to(self):
+        """
+        annotate_matches_to method should attach
+        annotations correctly to a Sequence object, tested
+        for both multiple and singular annotations.
+        For Sequence objects of MolType
+        ASCII, annotate_matches_to should return an empty list.
+        """
+        seq = self.DNA("TTCCACTTCCGCTT", name="x")
+        pattern = "CCRC"
+        annot = seq.annotate_matches_to(
+            pattern=pattern,
+            annot_type="domain",
+            name="fred",
+            allow_multiple=True,
+        )
+        self.assertEqual([a.get_slice() for a in annot], ["CCAC", "CCGC"])
+        annot = seq.annotate_matches_to(
+            pattern=pattern,
+            annot_type="domain",
+            name="fred",
+            allow_multiple=False,
+        )
+        fred = annot[0].get_slice()[0:len(pattern)]
+        self.assertEqual(len(annot), 1)
+        self.assertEqual(str(fred), "CCAC")
+        seq = ASCII.make_seq(seq="TTCCACTTCCGCTT")
+        annot = seq.annotate_matches_to(
+            pattern=pattern,
+            annot_type="domain",
+            name="fred",
+            allow_multiple=False,
+        )
+        self.assertEqual(annot, [])
+
+
     def test_init_empty(self):
         """Sequence and subclasses should init correctly."""
         # NOTE: ModelSequences can't be initialized empty because it screws up