cmusphinx · dhdaines · Aug 21, 2022 · Aug 12, 2022 · Aug 12, 2022 · Aug 12, 2022
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -8,12 +8,14 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+      - name: Install
+        run: |
+          sudo apt-get install sox
       - name: Build
         run: |
           mkdir build
           (cd build && cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=install ..)
           (cd build && make)
-
       - name: Run tests
         run: |
           (cd build && make check)
@@ -24,6 +26,7 @@ jobs:
         uses: actions/checkout@v3
       - name: Install
         run: |
+          sudo apt-get install sox
           python -m pip install --upgrade pip
           pip install -r requirements.dev.txt
           pip install .

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -31,6 +31,7 @@ endif()
 CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
 CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
 CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
+CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
 CHECK_SYMBOL_EXISTS(snprintf stdio.h HAVE_SNPRINTF)
 CHECK_SYMBOL_EXISTS(popen stdio.h HAVE_POPEN)
 CHECK_TYPE_SIZE(long LONG)

diff --git a/LICENSE b/LICENSE
@@ -28,4 +28,59 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+
+WebRTC VAD code (in src/vad):
+
+Copyright (c) 2011, The WebRTC project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Python WebRTC VAD code and test files (in cython and test/data/vad):
+
+The MIT License (MIT)
+
+Copyright (c) 2016 John Wiseman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/config.h.in b/config.h.in
@@ -24,6 +24,12 @@
 /* Define if you have the <unistd.h> header file. */
 #cmakedefine HAVE_UNISTD_H
 
+/* Define if you have the <inttypes.h> header file. */
+#cmakedefine HAVE_INTTYPES_H
+
+/* Define if you have the <stdint.h> header file. */
+#cmakedefine HAVE_STDINT_H
+
 /* The size of `long', as computed by sizeof. */
 #cmakedefine SIZEOF_LONG @SIZEOF_LONG@
 

diff --git a/cython/_pocketsphinx.pxd b/cython/_pocketsphinx.pxd
@@ -20,6 +20,7 @@ cdef extern from "sphinxbase/err.h":
     ctypedef err_e err_lvl_t
     ctypedef void (*err_cb_f)(void* user_data, err_lvl_t lvl, const char *msg)
     void err_set_callback(err_cb_f callback, void *user_data)
+    const char *err_set_loglevel_str(const char *lvl)
 
 
 cdef extern from "sphinxbase/logmath.h":
@@ -405,3 +406,52 @@ cdef extern from "pocketsphinx/ps_search.h":
     int ps_set_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
     int ps_set_allphone_file(ps_decoder_t *ps, const char *name, const char *path)
     int ps_set_align(ps_decoder_t *ps, const char *name, const char *words)
+
+cdef extern from "pocketsphinx/ps_vad.h":
+    ctypedef struct ps_vad_t:
+        pass
+    cdef enum ps_vad_mode_e:
+        PS_VAD_LOOSE,
+        PS_VAD_MEDIUM_LOOSE,
+        PS_VAD_MEDIUM_STRICT,
+        PS_VAD_STRICT
+    ctypedef ps_vad_mode_e ps_vad_mode_t
+    cdef enum ps_vad_class_e:
+        PS_VAD_ERROR,
+        PS_VAD_NOT_SPEECH,
+        PS_VAD_SPEECH
+    ctypedef ps_vad_class_e ps_vad_class_t
+    cdef int PS_VAD_DEFAULT_SAMPLE_RATE
+    cdef double PS_VAD_DEFAULT_FRAME_LENGTH
+
+    ps_vad_t *ps_vad_init(ps_vad_mode_t mode, int sample_rate, double frame_length)
+    int ps_vad_free(ps_vad_t *vad)
+    int ps_vad_set_input_params(ps_vad_t *vad, int sample_rate, double frame_length)
+    int ps_vad_sample_rate(ps_vad_t *vad)
+    size_t ps_vad_frame_size(ps_vad_t *vad)
+    double ps_vad_frame_length(ps_vad_t *vad)
+    ps_vad_class_t ps_vad_classify(ps_vad_t *vad, const short *frame)
+
+cdef extern from "pocketsphinx/ps_endpointer.h":
+    ctypedef struct ps_endpointer_t:
+        pass
+    cdef double PS_ENDPOINTER_DEFAULT_WINDOW
+    cdef double PS_ENDPOINTER_DEFAULT_RATIO
+    ps_endpointer_t *ps_endpointer_init(double window,
+                                        double ratio,
+                                        ps_vad_mode_t mode,
+                                        int sample_rate, double frame_length)
+    ps_endpointer_t *ps_endpointer_retain(ps_endpointer_t *ep)
+    int ps_endpointer_free(ps_endpointer_t *ep)
+    ps_vad_t *ps_endpointer_vad(ps_endpointer_t *ep)
+    size_t ps_endpointer_frame_size(ps_endpointer_t *ep)
+    int ps_endpointer_sample_rate(ps_endpointer_t *ep)
+    const short *ps_endpointer_process(ps_endpointer_t *ep,
+                                       const short *frame)
+    const short *ps_endpointer_end_stream(ps_endpointer_t *ep,
+                                          const short *frame,
+                                          size_t nsamp,
+                                          size_t *out_nsamp)
+    int ps_endpointer_in_speech(ps_endpointer_t *ep)
+    double ps_endpointer_speech_start(ps_endpointer_t *ep)
+    double ps_endpointer_speech_end(ps_endpointer_t *ep)
diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx
@@ -109,7 +109,7 @@ cdef class Config:
         if config == NULL:
             return None
         return Config.create_from_ptr(config)
-        
+
     def __dealloc__(self):
         cmd_ln_free_r(self.cmd_ln)
 
@@ -662,7 +662,7 @@ cdef class Jsgf:
     """JSGF parser.
     """
     cdef jsgf_t *jsgf
-    
+
     def __init__(self, str path, Jsgf parent=None):
         cdef jsgf_t *cparent
         cpath = path.encode()
@@ -710,12 +710,12 @@ cdef class Lattice:
     def __dealloc__(self):
         if self.dag != NULL:
             ps_lattice_free(self.dag)
-    
+
     def write(self, str path):
         rv = ps_lattice_write(self.dag, path.encode("utf-8"))
         if rv < 0:
             raise RuntimeError("Failed to write lattice to %s" % path)
-    
+
     def write_htk(self, str path):
         rv = ps_lattice_write_htk(self.dag, path.encode("utf-8"))
         if rv < 0:
@@ -1124,7 +1124,7 @@ cdef class Decoder:
         if fsg == NULL:
             return None
         return FsgModel.create_from_ptr(fsg_model_retain(fsg))
-    
+
     def set_fsg(self, str name, FsgModel fsg):
         """Create a search module from an FSG.
 
@@ -1218,7 +1218,7 @@ cdef class Decoder:
         if rv < 0:
             return RuntimeError("Failed to set keyword search %s from phrase %s"
                                 % (name, keyphrase))
-    
+
     def set_allphone_file(self, str name, str lmfile = None):
         """Create a phoneme recognition search module.
 
@@ -1285,7 +1285,7 @@ cdef class Decoder:
             Config: Configuration parsed from `path`.
         """
         return Config.parse_file(path)
-    
+
     def load_dict(self, str dict_path, str fdict_path = None, str _format = None):
         """Load dictionary (and possibly noise dictionary) from a file.
 
@@ -1453,3 +1453,166 @@ cdef class Decoder:
         """
         return ps_get_n_frames(self.ps)
 
+cdef class Vad:
+    """Voice activity detection class.
+
+    Args:
+      mode(int): Aggressiveness of voice activity detection (0-3)
+      sample_rate(int): Sampling rate of input, default is 16000.
+                        Rates other than 8000, 16000, 32000, 48000
+                        are only approximately supported, see note
+                        in `frame_length`.  Outlandish sampling
+                        rates like 3924 and 115200 will raise a
+                        `ValueError`.
+      frame_length(float): Desired input frame length in seconds,
+                           default is 0.03.  The *actual* frame
+                           length may be different if an
+                           approximately supported sampling rate is
+                           requested.  You must *always* use the
+                           `frame_bytes` and `frame_length`
+                           attributes to determine the input size.
+
+    Attributes:
+      sample_rate(int): Sampling rate of input (default is 16000)
+      frame_bytes(int): Number of bytes in a frame accepted by `process`.
+      frame_length(float): Length of a frame (*may be different from
+                           the one requested in the constructor*!)
+
+    Raises:
+      ValueError: Invalid input parameter (see above).
+    """
+    cdef ps_vad_t *_vad
+    LOOSE = PS_VAD_LOOSE
+    MEDIUM_LOOSE = PS_VAD_MEDIUM_LOOSE
+    MEDIUM_STRICT = PS_VAD_MEDIUM_STRICT
+    STRICT = PS_VAD_STRICT
+    DEFAULT_SAMPLE_RATE = PS_VAD_DEFAULT_SAMPLE_RATE
+    DEFAULT_FRAME_LENGTH = PS_VAD_DEFAULT_FRAME_LENGTH
+
+    def __init__(self, mode=PS_VAD_LOOSE,
+                 sample_rate=PS_VAD_DEFAULT_SAMPLE_RATE,
+                 frame_length=PS_VAD_DEFAULT_FRAME_LENGTH):
+        self._vad = ps_vad_init(mode, sample_rate, frame_length)
+        if self._vad == NULL:
+            raise ValueError("Invalid VAD parameters")
+
+    def __dealloc__(self):
+        ps_vad_free(self._vad)
+
+    @property
+    def frame_bytes(self):
+        return ps_vad_frame_size(self._vad) * 2
+
+    @property
+    def frame_length(self):
+        return ps_vad_frame_length(self._vad)
+
+    @property
+    def sample_rate(self):
+        return ps_vad_sample_rate(self._vad)
+
+    def is_speech(self, frame, sample_rate=None):
+        """Classify a frame as speech or not.
+
+        Args:
+          frame(bytes): Buffer containing speech data (16-bit signed
+                        integers), exactly `frame_bytes` bytes long.
+          sample_rate: Accepted but not used by this method.
+        Returns:
+          (boolean) Classification as speech or not speech.
+        Raises:
+          IndexError: `frame` is of invalid size.
+          ValueError: Other internal VAD error.
+        """
+        cdef const unsigned char[:] cframe = frame
+        cdef Py_ssize_t n_samples = len(cframe) // 2
+        if len(cframe) != self.frame_bytes:
+            raise IndexError("Frame size must be %d bytes" % self.frame_bytes)
+        rv = ps_vad_classify(self._vad, <const short *>&cframe[0])
+        if rv < 0:
+            raise ValueError("VAD classification failed")
+        return rv == PS_VAD_SPEECH
+
+cdef class Endpointer:
+    """Simple endpointer using voice activity detection.
+    """
+    cdef ps_endpointer_t *_ep
+    DEFAULT_WINDOW = PS_ENDPOINTER_DEFAULT_WINDOW
+    DEFAULT_RATIO = PS_ENDPOINTER_DEFAULT_RATIO
+    def __init__(self, window=0.3, ratio=0.9, vad_mode=Vad.LOOSE,
+                 sample_rate=Vad.DEFAULT_SAMPLE_RATE,
+                 frame_length=Vad.DEFAULT_FRAME_LENGTH):
+        self._ep = ps_endpointer_init(window, ratio,
+                                      vad_mode, sample_rate, frame_length)
+        if (self._ep == NULL):
+            raise ValueError("Invalid endpointer or VAD parameters")
+
+    def __dealloc__(self):
+        # Release the handle from ps_endpointer_init (NULL if init failed).
+        if self._ep != NULL:
+            ps_endpointer_free(self._ep)
+
+    @property
+    def frame_bytes(self):
+        return ps_endpointer_frame_size(self._ep) * 2
+
+    @property
+    def sample_rate(self):
+        return ps_endpointer_sample_rate(self._ep)
+
+    @property
+    def in_speech(self):
+        return ps_endpointer_in_speech(self._ep)
+
+    @property
+    def speech_start(self):
+        return ps_endpointer_speech_start(self._ep)
+
+    @property
+    def speech_end(self):
+        return ps_endpointer_speech_end(self._ep)
+
+    def process(self, frame):
+        """Read a frame of data and return speech if detected.
+
+        Args:
+          frame(bytes): Buffer containing speech data (16-bit signed
+                        integers).  Must be of length `frame_bytes`
+                        (in bytes).
+        Returns:
+          (bytes) Frame of speech data, or None if none detected.
+        Raises:
+          IndexError: `frame` is of invalid size.
+        """
+        cdef const unsigned char[:] cframe = frame
+        cdef Py_ssize_t n_samples = len(cframe) // 2
+        cdef const short *outframe
+        if len(cframe) != self.frame_bytes:
+            raise IndexError("Frame size must be %d bytes" % self.frame_bytes)
+        outframe = ps_endpointer_process(self._ep,
+                                         <const short *>&cframe[0])
+        if outframe == NULL:
+            return None
+        # Slicing the C pointer copies the frame into a new bytes object.
+        return (<const unsigned char *>&outframe[0])[:n_samples * 2]
+
+    def end_stream(self, frame):
+        """Pass a final, possibly short, frame to the endpointer; return its output bytes or None."""
+        cdef const unsigned char[:] cframe = frame
+        cdef Py_ssize_t n_samples = len(cframe) // 2
+        cdef const short *outbuf
+        cdef size_t out_n_samples
+        if len(cframe) > self.frame_bytes:
+            raise IndexError("Frame size must be %d bytes or less" % self.frame_bytes)
+        outbuf = ps_endpointer_end_stream(self._ep,
+                                          <const short *>&cframe[0],
+                                          n_samples, &out_n_samples)
+        if outbuf == NULL:
+            return None
+        return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]
+
+def set_loglevel(level):
+    """Pass `level` to `err_set_loglevel_str`; raise ValueError if it returns NULL."""
+    encoded = level.encode('utf-8')
+    if err_set_loglevel_str(encoded) == NULL:
+        raise ValueError("Invalid log level %s" % level)