Skip to content

Commit

Permalink
Merge pull request #472 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
ENH: genetic code gets a to_regex method
  • Loading branch information
GavinHuttley committed Jan 7, 2020
2 parents 3002d42 + a3406d1 commit f88dc89
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
29 changes: 28 additions & 1 deletion src/cogent3/core/genetic_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _simple_rc(seq):
_bases = "TCAG"


class GeneticCode(object):
class GeneticCode:
"""Holds codon to amino acid mapping, and vice versa.
Usage: gc = GeneticCode(code_sequence)
Expand Down Expand Up @@ -303,6 +303,33 @@ def changes(self, other):
changes[codon] = old + new
return changes

def to_regex(self, seq):
    """Build a regex pattern in which every amino acid is replaced by its codons.

    Parameters
    ----------
    seq
        a Sequence or string of amino acids

    Returns
    -------
    str
        one non-capturing group per residue of ``seq``, concatenated in
        sequence order; each group lists, separated by ``|``, the codons
        this genetic code returns for that residue
    """
    from .moltype import PROTEIN_WITH_STOP_ambiguities as ambigs

    groups = []
    for residue in str(seq):
        # an ambiguity character stands for several amino acids, so the
        # group for it must offer the codons of every one of them
        members = ambigs[residue] if residue in ambigs else [residue]
        alternatives = [codon for member in members for codon in self[member]]

        # non-capturing, so a match on the pattern yields the whole DNA
        # segment rather than per-residue sub-groups
        groups.append(f"(?:{'|'.join(alternatives)})")

    return "".join(groups)


NcbiGeneticCodeData = [
GeneticCode(*data)
Expand Down
17 changes: 17 additions & 0 deletions tests/test_core/test_genetic_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,23 @@ def test_to_table(self):
# check there are 3 headers
self.assertEqual(table.shape[1], 3)

def test_to_regex(self):
    """regex built from an amino acid seq matches the encoding DNA sequence"""
    import re
    from cogent3 import make_seq

    dna = "ACCGAACAGGGC"

    def matched_dna(peptide):
        # everything the peptide-derived pattern finds in dna, rejoined
        return "".join(re.findall(DEFAULT.to_regex(peptide), dna))

    # plain string of unambiguous amino acids
    self.assertTrue(matched_dna("TEQG") == dna)

    # Z is the ambiguity character for Q or E, so it must still match GAA
    ambiguous = "TZQG"
    self.assertTrue(matched_dna(ambiguous) == dna)

    # a protein sequence object is accepted as well as a plain string
    self.assertTrue(matched_dna(make_seq(ambiguous, moltype="protein")) == dna)


# Run tests if called from command line
if __name__ == "__main__":
Expand Down

0 comments on commit f88dc89

Please sign in to comment.