
Commit

Merge pull request #365 from dictation-toolbox/fix/dictation-differentiation

Improve the dictation element recognition decoding procedure
drmfinlay committed Mar 7, 2022
2 parents 5be8521 + 3981f8d commit 973a5bb
Showing 16 changed files with 353 additions and 541 deletions.
45 changes: 10 additions & 35 deletions documentation/sphinx_engine.txt
@@ -267,41 +267,16 @@ and dragonfly's :class:`Dictation` functionality.
Dictation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Mixing free-form dictation with grammar rules is difficult to reproduce with
the CMU Sphinx engines. It is either dictation or grammar rules, not both.
For this reason, the CMU Pocket Sphinx SR engine supports speaking free-form
dictation, but only on its own. Parts of rules that have required
combinations with :class:`Dictation` and other basic Dragonfly elements such
as :class:`Literal`, :class:`RuleRef` and :class:`ListRef` will not be
recognised properly using this SR engine via speaking.

You can use the :meth:`engine.mimic` method, the :class:`Mimic` action or
the :class:`Playback` action to match :class:`Dictation` elements by using
all uppercase words. For example:


.. code:: Python

    from dragonfly import Grammar, CompoundRule, Dictation, get_engine

    engine = get_engine("sphinx")
    engine.config.START_ASLEEP = False


    class MyRule(CompoundRule):
        spec = "hello <text>"
        extras = [Dictation("text")]

        def _process_recognition(self, node, extras):
            # "world" will be printed in lowercase to be consistent with
            # normal output from CMU Pocket Sphinx.
            print(extras["text"])


    grammar = Grammar("dictation grammar")
    grammar.add_rule(MyRule())
    grammar.load()

    engine.mimic("hello WORLD")
Mixing free-form dictation with grammar rules is difficult with the CMU
Sphinx decoders. It is either dictation or grammar rules, not both. For this
reason, Dragonfly's CMU Pocket Sphinx SR engine supports speaking free-form
dictation, but only on its own.

Parts of rules that require combinations of :class:`Dictation` and other
basic Dragonfly elements, such as :class:`Literal`, :class:`RuleRef` and
:class:`ListRef`, will not be recognised properly when spoken using this SR
engine. They can, however, be recognised via the :meth:`engine.mimic`
method, the :class:`Mimic` action or the :class:`Playback` action.

.. note::

8 changes: 3 additions & 5 deletions documentation/text_engine.txt
@@ -14,11 +14,9 @@ Note that :meth:`dragonfly.engines.get_engine` called without ``"text"``
will **never** initialise the text-input engine. This is because real speech
recognition backends should be returned from the function by default.

All dragonfly elements and rule classes should be supported. Use all
uppercase words to mimic input for :class:`Dictation` elements, e.g.
`"find SOME TEXT"` to match the dragonfly spec `"find <text>"`.
`executable`, `title`, and `handle` keyword arguments may optionally be
passed to :meth:`engine.mimic` to simulate a particular foreground window.
All dragonfly elements and rule classes should be supported. `executable`,
`title`, and `handle` keyword arguments may optionally be passed to
:meth:`engine.mimic` to simulate a particular foreground window.
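
For example, here is a minimal sketch using these keyword arguments together
with an application context (the executable name, window title and handle
shown are illustrative):

.. code:: Python

    from dragonfly import (AppContext, CompoundRule, Dictation, Grammar,
                           get_engine)

    engine = get_engine("text")


    class FindRule(CompoundRule):
        spec = "find <text>"
        extras = [Dictation("text")]

        def _process_recognition(self, node, extras):
            print(extras["text"])


    # This grammar is only active when Notepad is the foreground window.
    grammar = Grammar("find grammar",
                      context=AppContext(executable="notepad"))
    grammar.add_rule(FindRule())
    grammar.load()

    # Simulate Notepad as the foreground window for this recognition.
    engine.mimic("find some text", executable="notepad",
                 title="Untitled - Notepad", handle=1)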


Engine Configuration
38 changes: 17 additions & 21 deletions dragonfly/engines/backend_kaldi/engine.py
@@ -30,7 +30,6 @@
from six.moves import zip
from kaldi_active_grammar import KaldiError, KaldiRule

from dragonfly.grammar.state import State
from dragonfly.windows.window import Window
from dragonfly.engines.base import (EngineBase,
EngineError,
@@ -612,7 +611,7 @@ def _parse_recognition(self, output, mimic=False):
if self._log.isEnabledFor(12):
try:
self._log.log(12, "Alignment (word,time,length): %s", self._decoder.get_word_align(output))
except KaldiError as e:
except KaldiError:
self._log.warning("Exception logging word alignment")

else:
Expand Down Expand Up @@ -697,6 +696,20 @@ def __init__(self, grammar, kaldi_rule_by_rule_dict, engine,
def phrase_start_callback(self, executable, title, handle):
    self.grammar.process_begin(executable, title, handle)

def _decode_grammar_rules(self, state, words, results, *args):
    # Attempt to decode the given rule.  If successful, notify observers,
    # process the recognition and return True; otherwise return False.
    rule = args[0]
    state.initialize_decoding()
    for result in rule.decode(state):
        if state.finished():
            root = state.build_parse_tree()
            notify_args = (words, rule, root, results)
            self.recobs_manager.notify_recognition(*notify_args)
            with debug_timer(self.engine._log.debug, "rule execution time"):
                rule.process_recognition(root)
            self.recobs_manager.notify_post_recognition(*notify_args)
            return True
    return False

def recognition_callback(self, recognition):
words = recognition.words
rule = recognition.kaldi_rule.parent_rule
@@ -709,25 +722,8 @@ def recognition_callback(self, recognition):
words_rules = tuple((word, 0 if not is_dictation else 1)
for (word, is_dictation) in zip(words, words_are_dictation_mask))

# Attempt to parse the recognition
func = getattr(self.grammar, "process_recognition", None)
if func:
if not self._process_grammar_callback(func, words=words,
results=recognition):
# Return early if the method didn't return True or equiv.
return

state = State(words_rules, rule_names, self.engine)
state.initialize_decoding()
for result in rule.decode(state):
if state.finished():
root = state.build_parse_tree()
notify_args = (words, rule, root, recognition)
self.recobs_manager.notify_recognition(*notify_args)
with debug_timer(self.engine._log.debug, "rule execution time"):
rule.process_recognition(root)
self.recobs_manager.notify_post_recognition(*notify_args)
return
# Attempt to process the recognition.
if self.process_results(words_rules, rule_names, recognition, rule): return

except Exception as e:
self.engine._log.error("Grammar %s: exception: %s" % (self.grammar._name, e), exc_info=True)
2 changes: 1 addition & 1 deletion dragonfly/engines/backend_natlink/compiler.py
@@ -447,7 +447,7 @@ def _compile_rule_chunk(self, chunk_id):
return header + definition_data

def _get_rule_names(self):
return tuple([None] + self._rules)
return tuple(self._rules)

rule_names = property(_get_rule_names,
doc="Read-only access to the list of rule names.")
135 changes: 38 additions & 97 deletions dragonfly/engines/backend_natlink/engine.py
@@ -40,7 +40,7 @@

from six import text_type, binary_type, string_types, PY2

from dragonfly.grammar import elements as elements_, state as state_

from dragonfly.engines.base import (EngineBase, EngineError, MimicFailure,
GrammarWrapperBase)
from dragonfly.engines.backend_natlink.compiler import NatlinkCompiler
@@ -211,7 +211,7 @@ def _load_grammar(self, grammar):

c = NatlinkCompiler()
(compiled_grammar, rule_names) = c.compile_grammar(grammar)
grammar._rule_names = rule_names
wrapper.rule_names = rule_names

all_results = (hasattr(grammar, "process_recognition_other")
or hasattr(grammar, "process_recognition_failure"))
@@ -304,9 +304,6 @@ def update_list(self, lst, grammar):
grammar_object.emptyList(n)
[f(n, word) for word in lst.get_list_items()]

# Clear grammar wrapper word sets so they get recalculated.
wrapper.rule_words_map.clear()

#-----------------------------------------------------------------------
# Miscellaneous methods.

@@ -423,102 +420,61 @@ def set_retain_directory(self, retain_dir):

class GrammarWrapper(GrammarWrapperBase):

# Enable guessing at which words were dictated, since DNS does not
# always report accurate rule IDs.
_dictated_word_guesses_enabled = True

def __init__(self, grammar, grammar_object, engine, recobs_manager):
GrammarWrapperBase.__init__(self, grammar, engine, recobs_manager)
self.grammar_object = grammar_object
self.rule_words_map = {}

def get_rule_words(self, rule):
# Return a set containing any words used in this rule or in any
# referenced rules or lists. Store the set for each rule as an
# optimization.
if rule.name in self.rule_words_map:
return self.rule_words_map[rule.name]

words = set()
for element in self.grammar._get_element_list(rule):
if isinstance(element, elements_.Literal):
# Only get the required first word.
literal_words = element.words_ext
if literal_words:
words.add(literal_words[0])
elif isinstance(element, elements_.RuleRef):
words.update(self.get_rule_words(element.rule))
elif isinstance(element, elements_.ListRef):
for string in element.list.get_list_items():
# Only get the required first word.
list_item_words = string.split()
if list_item_words:
words.add(list_item_words[0])

self.rule_words_map[rule.name] = words
return words
self.rule_names = None

def begin_callback(self, module_info):
executable, title, handle = tuple(map_word(word)
for word in module_info)
self.grammar.process_begin(executable, title, handle)

def _process_rules(self, words, words_rules, results,
manual_rule_ids):
# Iterates through this grammar's rules, attempting
# to decode each. If successful, call that rule's
# method for processing the recognition and return.
s = state_.State(words_rules, self.grammar._rule_names,
self.engine)
for r in self.grammar._rules:
if not (r.active and r.exported): continue

# Set dictation words manually if DNS didn't report a difference
# between command and dictation words. A word is set as
# dictation if it isn't a reported DNS dictation word and isn't
# a word in the current top-level rule or any referenced rules.
if manual_rule_ids:
rule_words = self.get_rule_words(r)
words_rules2 = tuple(
(w, 1000000) if r < 1000000 and w not in rule_words
else (w, r)
for w, r in words_rules
)
s = state_.State(words_rules2, self.grammar._rule_names,
self.engine)
s.initialize_decoding()
for result in r.decode(s):
if s.finished():
self._retain_audio(words, results, r.name)
root = s.build_parse_tree()
def _decode_grammar_rules(self, state, words, results, *args):
# Iterate through this grammar's rules, attempting to decode each.
# If successful, call that rule's method for processing the
# recognition and return.
for rule in self.grammar.rules:
if not (rule.active and rule.exported): continue
state.initialize_decoding()
for _ in rule.decode(state):
if state.finished():
self._retain_audio(words, results, rule.name)
root = state.build_parse_tree()

# Notify observers using the manager *before*
# processing.
notify_args = (words, r, root, results)
self.recobs_manager.notify_recognition(*notify_args)

r.process_recognition(root)

# Notify observers using the manager *after*
# processing.
# TODO Use words="other" instead, with a special
# recobs grammar wrapper at index 0.
notify_args = (words, rule, root, results)
self.recobs_manager.notify_recognition(
*notify_args
)
try:
rule.process_recognition(root)
except Exception as e:
self._log.exception("Failed to process rule "
"'%s': %s" % (rule.name, e))
self.recobs_manager.notify_post_recognition(
*notify_args
)
return True

return False

def results_callback(self, words, results):
NatlinkEngine._log.debug("Grammar %s: received recognition %r."
% (self.grammar._name, words))
self._log.debug("Grammar %s: received recognition %r."
% (self.grammar.name, words))

if words == "other":
func = getattr(self.grammar, "process_recognition_other", None)
self._process_grammar_callback(
func, words=tuple(map_word(w) for w in results.getWords(0)),
results=results
)
result_words = tuple(map_word(w) for w in results.getWords(0))
self.process_special_results(words, result_words, results)
return
elif words == "reject":
func = getattr(self.grammar, "process_recognition_failure",
None)
self._process_grammar_callback(func, results=results)
self.process_special_results(words, None, results)
return

# If the words argument was not "other" or "reject", then
@@ -527,28 +483,13 @@ def results_callback(self, words, results):
words_rules = tuple((map_word(w), r) for w, r in words)
words = tuple(w for w, r in words_rules)

# Call the grammar's general process_recognition method, if present.
func = getattr(self.grammar, "process_recognition", None)
if func:
if not self._process_grammar_callback(func, words=words,
results=results):
# Return early if the method didn't return True or equiv.
return

# Attempt to decode each grammar rule and process the recognition if
# successful.
if self._process_rules(words, words_rules, results, False):
return

# Try again. This time try to set words as dictation words where
# appropriate.
if self._process_rules(words, words_rules, results, True):
# Process this recognition.
if self.process_results(words_rules, self.rule_names, results):
return

# Failed to decode recognition.
NatlinkEngine._log.warning("Grammar %s: failed to decode"
" recognition %r."
% (self.grammar._name, words))
self._log.error("Grammar %s: failed to decode recognition %r."
% (self.grammar._name, words))

def _retain_audio(self, words, results, rule_name):
# Only write audio data and metadata if the directory exists.
