Merge pull request #345 from dictation-toolbox/tts-improvements

Make SR engine text-to-speech functionality more flexible
dictation-toolbox · Mar 18, 2022 · 330cbca · 330cbca
2 parents 1a7814e + abbb361
commit 330cbca
Show file tree

Hide file tree

Showing 17 changed files with 557 additions and 55 deletions.
diff --git a/documentation/base_engine.txt b/documentation/base_engine.txt
@@ -10,8 +10,8 @@ Base engine classes
 EngineBase class
 ----------------------------------------------------------------------------
 
-The :class:`dragonfly.engines.engine_base.EngineBase` class forms the base
-class for this specific speech recognition engine classes.  It defines
+The :class:`dragonfly.engines.base.EngineBase` class forms the base
+class for the specific speech recognition engine classes.  It defines
 the stubs required and performs some of the logic necessary for
 Dragonfly to be able to interact with a speech recognition engine.
 

diff --git a/documentation/conf.py b/documentation/conf.py
@@ -91,6 +91,7 @@ def __getattr__(cls, name):
     "numpy",
     "pyperclip",
     "regex",
+    "natlink"
 }
 
 for module_name in mock_modules:

diff --git a/documentation/engines.txt b/documentation/engines.txt
@@ -8,6 +8,10 @@ Dragonfly supports multiple speech recognition engines as its backend.
 The *engines* sub-package implements the interface code for each
 supported engine.
 
+Also contained within this sub-package are a number of text-to-speech
+implementations.  These can be used independently of the speech recognition
+engines via the ``get_speaker()`` function.
+
 
 Main SR engine back-end interface
 ----------------------------------------------------------------------------
@@ -28,3 +32,14 @@ Engine back-ends
     kaldi_engine
     sphinx_engine
     text_engine
+
+Text-to-speech (speaker) back-ends
+----------------------------------------------------------------------------
+
+For more information on the available text-to-speech implementations, see
+the following sections:
+
+.. toctree::
+    :maxdepth: 2
+
+    speakers
diff --git a/dragonfly/__init__.py b/dragonfly/__init__.py
@@ -20,15 +20,15 @@
 
 import sys
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 from .config            import Config, Section, Item
 from .error             import DragonflyError, GrammarError
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 from .engines           import (get_engine, EngineError, MimicFailure,
-                                get_current_engine)
+                                get_current_engine, get_speaker)
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 from .grammar.grammar_base       import Grammar
 from .grammar.grammar_connection import ConnectionGrammar
 from .grammar.rule_base          import Rule
@@ -52,7 +52,7 @@
                                          register_ending_callback,
                                          register_post_recognition_callback)
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 
 from .actions           import (ActionBase, DynStrActionBase, ActionError,
                                 Repeat, Key, Text, Mouse, Paste, Pause,
@@ -65,25 +65,25 @@
     from .actions       import (KeyboardInput, MouseInput, HardwareInput,
                                 make_input_array, send_input_array)
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 
 if sys.platform.startswith("win"):
     from .windows.clipboard import Clipboard
 else:
     from .util              import Clipboard
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 
 from .windows.rectangle import Rectangle, unit
 from .windows.point     import Point
 from .windows           import Window, Monitor, monitors
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 from .language          import (Integer, IntegerRef, ShortIntegerRef,
                                 Digits, DigitsRef,
                                 Number, NumberRef)
 
-# --------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 from .accessibility     import (CursorPosition, TextQuery,
                                 get_accessibility_controller,
                                 get_stopping_accessibility_controller)

diff --git a/dragonfly/engines/__init__.py b/dragonfly/engines/__init__.py
@@ -25,12 +25,19 @@
 
 from .base import EngineBase, EngineError, MimicFailure
 
-
-# ---------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 
 _default_engine = None
 _engines_by_name = {}
 
+_default_speaker = None
+_speakers_by_name = {}
+
+_sapi5_names = ("sapi5shared", "sapi5inproc", "sapi5")
+_valid_engine_names = ("natlink", "kaldi", "sphinx", "text") + _sapi5_names
+_valid_speaker_names = ("natlink", "text", "espeak", "flite") + _sapi5_names
+
+
 
 def get_engine(name=None, **kwargs):
     """
@@ -97,7 +104,7 @@ def get_engine(name=None, **kwargs):
     if engine:
         return engine
 
-    # Check if we're on Windows. If  we're not on Windows, then we don't
+    # Check if we're on Windows.  If we're not on Windows, then we don't
     #  evaluate Windows-only engines like natlink.
     windows = os.name == 'nt'
 
@@ -129,8 +136,7 @@ def get_engine(name=None, **kwargs):
             if name:
                 raise EngineError(message)
 
-    sapi5_names = (None, "sapi5shared", "sapi5inproc", "sapi5")
-    if not engine and windows and name in sapi5_names:
+    if not engine and windows and name in (None,) + _sapi5_names:
         # Attempt to retrieve the sapi5 back-end.
         try:
             from .backend_sapi5 import is_engine_available
@@ -184,9 +190,7 @@ def get_engine(name=None, **kwargs):
     elif not name:
         raise EngineError("No usable engines found.")
     else:
-        valid_names = ["natlink", "kaldi", "sphinx", "sapi5shared",
-                       "sapi5inproc", "sapi5", "text"]
-        if name not in valid_names:
+        if name not in _valid_engine_names:
             raise EngineError("Requested engine %r is not a valid engine "
                               "name." % (name,))
         else:
@@ -222,7 +226,7 @@ def get_current_engine():
     return _default_engine
 
 
-# ---------------------------------------------------------------------------
+#---------------------------------------------------------------------------
 
 def register_engine_init(engine):
     """
@@ -238,3 +242,163 @@ def register_engine_init(engine):
         _default_engine = engine
     if engine and engine.name not in _engines_by_name:
         _engines_by_name[engine.name] = engine
+
+
+#---------------------------------------------------------------------------
+
+def get_speaker(name=None):
+    """
+        Get the text-to-speech (speaker) implementation.
+
+        This function will initialize and return a speaker instance
+        of the available speaker back-end.  If one has already been
+        initialized, it will be returned instead.
+
+        If no specific speaker back-end is requested and no speaker has
+        already been initialized, this function will initialize and return
+        an instance of the first available back-end in the following order:
+
+         =======================   =========================================
+         TTS speaker back-end      Speaker name string(s)
+         =======================   =========================================
+         1. SAPI 5                 ``"sapi5"``
+         2. Dragon/Natlink         ``"natlink"``
+         3. eSpeak                 ``"espeak"``
+         4. CMU Flite              ``"flite"``
+         5. Text (stdout)          ``"text"``
+         =======================   =========================================
+
+        The first two speaker back-ends are only available on Microsoft
+        Windows.  The second requires that Dragon NaturallySpeaking and
+        Natlink are installed on the system.
+
+        The third and fourth back-ends, eSpeak and CMU Flite, may be used on
+        most platforms.  These require that the appropriate command-line
+        programs are installed on the system.
+
+        The last back-end (text) is used as a fallback when no real speaker
+        implementation is available.  This back-end writes input text to
+        stdout, i.e., prints text to the console.
+
+        **Arguments**:
+
+        :param name: optional human-readable name of the speaker to return.
+        :type name: str
+        :rtype: SpeakerBase
+        :returns: speaker instance
+        :raises: EngineError
+    """
+    global _default_speaker, _speakers_by_name
+    log = logging.getLogger("speaker")
+
+    if name and name in _speakers_by_name:
+        speaker = _speakers_by_name[name]
+    elif not name and _default_speaker:
+        speaker = _default_speaker
+    else:
+        speaker = None
+    if speaker:
+        return speaker
+
+    windows = os.name == 'nt'
+    if not speaker and windows and name in (None,) + _sapi5_names:
+        # Check if the sapi5 back-end is available.
+        try:
+            from .backend_sapi5          import is_engine_available
+            from .backend_sapi5.speaker  import Sapi5Speaker
+            if is_engine_available(name):
+                speaker = Sapi5Speaker()
+        except Exception as e:
+            message = ("Exception while initializing sapi5 speaker:"
+                       " %s" % (e,))
+            log.warning(message)
+            if name:
+                raise EngineError(message)
+
+    if not speaker and windows and name in (None, "natlink"):
+        # Check if the natlink back-end is available.
+        try:
+            from .backend_natlink          import is_engine_available
+            from .backend_natlink.speaker  import NatlinkSpeaker
+            if is_engine_available():
+                speaker = NatlinkSpeaker()
+        except Exception as e:
+            message = ("Exception while initializing natlink speaker:"
+                       " %s" % (e,))
+            log.warning(message)
+            if name:
+                raise EngineError(message)
+
+    if not speaker and name in (None, "espeak"):
+        # Check if eSpeak is available.
+        try:
+            from .base.speaker_stdin import EspeakSpeaker
+            if EspeakSpeaker.is_available():
+                speaker = EspeakSpeaker()
+        except Exception as e:
+            message = ("Exception while initializing eSpeak speaker:"
+                       " %s" % (e,))
+            log.warning(message)
+            if name:
+                raise EngineError(message)
+
+    if not speaker and name in (None, "flite"):
+        # Check if CMU Flite is available.
+        try:
+            from .base.speaker_stdin import FliteSpeaker
+            if FliteSpeaker.is_available():
+                speaker = FliteSpeaker()
+        except Exception as e:
+            message = ("Exception while initializing Flite speaker:"
+                       " %s" % (e,))
+            log.warning(message)
+            if name:
+                raise EngineError(message)
+
+    if not speaker and name in (None, "text"):
+        # Check if the text back-end is available.
+        try:
+            from .backend_text          import is_engine_available
+            from .backend_text.speaker  import TextSpeaker
+            if is_engine_available():
+                speaker = TextSpeaker()
+        except Exception as e:
+            message = ("Exception while initializing text speaker:"
+                       " %s" % (e,))
+            log.warning(message)
+            if name:
+                raise EngineError(message)
+
+    # Return the speaker instance, if one has been initialized.  Log a
+    #  message about which TTS speaker back-end was used.
+    if speaker:
+        message = "Initialized %r speaker: %r." % (speaker.name, speaker)
+        log.info(message)
+        return speaker
+    elif not name:
+        raise EngineError("No usable speakers found.")
+    else:
+        if name not in _valid_speaker_names:
+            raise EngineError("Requested speaker %r is not a valid speaker "
+                              "name." % (name,))
+        else:
+            raise EngineError("Requested speaker %r not available."
+                              % (name,))
+
+
+#---------------------------------------------------------------------------
+
+def register_speaker_init(speaker):
+    """
+        Register initialization of a speaker.
+
+        This function sets the default speaker to the first speaker
+        initialized.
+
+    """
+
+    global _default_speaker, _speakers_by_name
+    if not _default_speaker:
+        _default_speaker = speaker
+    if speaker and speaker.name not in _speakers_by_name:
+        _speakers_by_name[speaker.name] = speaker
diff --git a/dragonfly/engines/backend_kaldi/engine.py b/dragonfly/engines/backend_kaldi/engine.py
@@ -30,6 +30,7 @@
 from six.moves             import zip
 from kaldi_active_grammar  import KaldiError, KaldiRule
 
+import dragonfly.engines
 from dragonfly.windows.window  import Window
 from dragonfly.engines.base    import (EngineBase,
                                        EngineError,
@@ -320,9 +321,7 @@ def mimic(self, words):
 
     def speak(self, text):
         """ Speak the given *text* using text-to-speech. """
-        # FIXME
-        self._log.warning("Text-to-speech is not implemented for this engine; printing text instead.")
-        print_(text)
+        dragonfly.engines.get_speaker().speak(text)
 
     def _get_language(self):
         return "en"

diff --git a/dragonfly/engines/backend_natlink/engine.py b/dragonfly/engines/backend_natlink/engine.py
@@ -38,11 +38,12 @@
 from locale     import getpreferredencoding
 from threading  import Thread, Event
 
-from six import text_type, binary_type, string_types, PY2
+from six        import text_type, binary_type, string_types, PY2
 
 
 from dragonfly.engines.base  import (EngineBase, EngineError, MimicFailure,
                                     GrammarWrapperBase)
+from dragonfly.engines.backend_natlink.speaker    import NatlinkSpeaker
 from dragonfly.engines.backend_natlink.compiler   import NatlinkCompiler
 from dragonfly.engines.backend_natlink.dictation  import \
     NatlinkDictationContainer
@@ -140,6 +141,7 @@ def __init__(self, retain_dir=None):
         self._timer_manager = NatlinkTimerManager(0.02, self)
         self._timer_thread = None
         self._retain_dir = None
+        self._speaker = NatlinkSpeaker()
         try:
             self.set_retain_directory(retain_dir)
         except EngineError as err:
@@ -351,16 +353,7 @@ def mimic(self, words):
 
     def speak(self, text):
         """ Speak the given *text* using text-to-speech. """
-        # Store the current mic state.
-        mic_state = self.natlink.getMicState()
-
-        # Say the text.
-        self.natlink.execScript('TTSPlayString "%s"' % text)
-
-        # Restore the previous mic state if necessary.
-        # This is to have consistent behaviour for each version of Dragon.
-        if mic_state != self.natlink.getMicState():
-            self.natlink.setMicState(mic_state)
+        self._speaker.speak(text)
 
     def _get_language(self):
         # Get a Windows language identifier from Dragon.