Support stop with partial recognized speech (#60)
compulim committed Aug 18, 2019
1 parent 2e326b4 commit 9bc0772
Showing 4 changed files with 61 additions and 46 deletions.
CHANGELOG.md (1 change: 1 addition & 0 deletions)
@@ -36,6 +36,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - `*`: Fix [#47](https://github.com/compulim/web-speech-cognitive-services/issues/47), add `enableTelemetry` option for disabling collecting telemetry data in Speech SDK, in PR [#51](https://github.com/compulim/web-speech-cognitive-services/pull/51)
 - `*`: Fix [#53](https://github.com/compulim/web-speech-cognitive-services/issues/53), added ESLint, in PR [#54](https://github.com/compulim/web-speech-cognitive-services/pull/54)
 - Speech synthesis: Fix [#39](https://github.com/compulim/web-speech-cognitive-services/issues/39), support SSML utterance, in PR [#57](https://github.com/compulim/web-speech-cognitive-services/pull/57)
+- Speech recognition: Fix [#59](https://github.com/compulim/web-speech-cognitive-services/issues/59), support `stop()` function by finalizing partial speech, in PR [#60](https://github.com/compulim/web-speech-cognitive-services/pull/60)

 ### Changed
@@ -40,7 +40,7 @@ Array [
 ]
 `;

-exports[`SpeechRecognition in continuous mode stop after recognized 2 speeches 1`] = `
+exports[`SpeechRecognition in continuous mode stop after recognized 1 speech and 1 ongoing 1`] = `
 Array [
   "cognitiveservices:audioSourceReady",
   "webspeech:start",
@@ -52,9 +52,9 @@ Array [
   "webspeech:result ['hello']",
   "cognitiveservices:recognized",
   "webspeech:result ['Hello.' (isFinal)]",
-  "cognitiveservices:stop",
   "cognitiveservices:recognized",
   "webspeech:result ['Hello.' (isFinal), 'World.' (isFinal)]",
+  "cognitiveservices:stop",
   "cognitiveservices:audioSourceOff",
   "webspeech:speechend",
   "webspeech:soundend",
@@ -97,6 +97,7 @@ Array [
   "webspeech:speechend",
   "webspeech:soundend",
   "webspeech:audioend",
+  "webspeech:error { error: 'no-speech' }",
   "webspeech:end",
 ]
 `;
@@ -108,6 +108,38 @@ export default ({
     return {};
   }

+  let onAudibleChunk;
+  let muted;
+
+  // We patch the "attach" function to detect when an audible chunk is read.
+  // We only patch the "attach" function once.
+  audioConfig.attach = improviseAsync(
+    audioConfig.attach.bind(audioConfig),
+    reader => ({
+      ...reader,
+      read: improviseAsync(
+        reader.read.bind(reader),
+        chunk => {
+          // The magic number 150 was measured by:
+          // 1. Setting the microphone volume to 0
+          // 2. Observing the amplitude (100-110) of the first few chunks
+          //    (short static caught when the microphone turns on)
+          // 3. Setting the threshold a bit higher than the observation
+
+          if (averageAmplitude(chunk.buffer) > 150) {
+            onAudibleChunk && onAudibleChunk();
+          }
+
+          if (muted) {
+            return { buffer: new ArrayBuffer(0), isEnd: true, timeReceived: Date.now() };
+          }
+
+          return chunk;
+        }
+      )
+    })
+  );
+
   SpeechRecognizer.enableTelemetry(enableTelemetry);

   class SpeechRecognition extends EventTarget {
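Two helpers used above, improviseAsync and averageAmplitude, are defined elsewhere in this file and do not appear in the diff. As a minimal sketch of what the call sites imply (the signatures are inferred, not copied from the repository):

    // Sketch only: reconstructed from the call sites above.
    // Wrap an async function so its resolved value passes through an "improviser".
    const improviseAsync = (fn, improviser) => (...args) => fn(...args).then(result => improviser(result));

    // Mean absolute amplitude of a chunk of 16-bit signed PCM audio.
    function averageAmplitude(arrayBuffer) {
      const samples = new Int16Array(arrayBuffer);
      let sum = 0;

      for (const sample of samples) {
        sum += Math.abs(sample);
      }

      return samples.length ? sum / samples.length : 0;
    }

With these in place, the patched read() inspects every audio chunk: a loud-enough chunk fires the one-shot onAudibleChunk callback, and once muted is set, the reader short-circuits with an empty end-of-stream chunk.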
@@ -183,34 +215,12 @@ export default ({
       let speechStarted;
       let stopping;

-      // We modify "attach" function and detect when the first chunk is read.
-      recognizer.audioConfig.attach = improviseAsync(
-        recognizer.audioConfig.attach.bind(recognizer.audioConfig),
-        reader => {
-          let firstAudibleChunkEmitted;
-
-          return {
-            ...reader,
-            read: improviseAsync(
-              reader.read.bind(reader),
-              chunk => {
-                // The magic number 150 is measured by:
-                // 1. Set microphone volume to 0
-                // 2. Observe the amplitude (100-110) for the first few chunks
-                //    (This is short static caught when turning on the microphone)
-                // 3. Set the number a bit higher than the observation
-
-                if (!firstAudibleChunkEmitted && averageAmplitude(chunk.buffer) > 150) {
-                  queue.push({ firstAudibleChunk: {} });
-                  firstAudibleChunkEmitted = true;
-                }
-
-                return chunk;
-              }
-            )
-          };
-        }
-      );
+      muted = false;
+
+      onAudibleChunk = () => {
+        queue.push({ firstAudibleChunk: {} });
+        onAudibleChunk = null;
+      };

       const { detach: detachAudioConfigEvent } = recognizer.audioConfig.events.attach(event => {
         const { name } = event;
@@ -346,16 +356,16 @@ export default ({
             error: 'aborted',
             type: 'error'
           };
-        } else if (finalizedResults.length) {
-          finalEvent = {
-            results: finalizedResults,
-            type: 'result'
-          };
+        } else {
+          // When we mute the stream and send { isEnd: true }, Speech Services will send us a final "recognized" event.
+          muted = true;
         }

         stopping = true;

-        await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync.bind(recognizer))();
+        if (abort) {
+          await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync.bind(recognizer))();
+        }
       } else if (audioSourceReady) {
         this.dispatchEvent(new SpeechRecognitionEvent('audiostart'));

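This hunk is the heart of the fix: a plain stop() no longer tears the recognizer down immediately. If nothing has been finalized yet, the session is muted, the patched reader returns an empty { isEnd: true } chunk, and Speech Services responds by finalizing the in-flight hypothesis with one last "recognized" event; only abort() still calls stopContinuousRecognitionAsync() right away. From the caller's side, the behavior looks roughly like this (a hypothetical consumer sketch; the setup and handler wiring are illustrative, not taken from this diff):

    // Hypothetical usage sketch. Assumes a SpeechRecognition ponyfill
    // instance created from this package; names are illustrative.
    const recognition = new SpeechRecognition();

    recognition.continuous = true;
    recognition.interimResults = true;

    recognition.onresult = ({ results }) => {
      // After stop(), the ongoing utterance arrives once more, finalized,
      // e.g. ['Hello.' (isFinal), 'World.' (isFinal)] as in the snapshot above.
    };

    recognition.onend = () => {
      // Fired last, after speechend/soundend/audioend.
    };

    recognition.start();
    // ...user finishes saying "Hello." and is midway through "World."...
    recognition.stop(); // finalizes the partial speech instead of dropping it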
@@ -417,12 +427,12 @@ export default ({
           }));
         }

-        if (!this.continuous) {
-          finalEvent = {
-            results: finalizedResults,
-            type: 'result'
-          };
+        finalEvent = {
+          results: finalizedResults,
+          type: 'result'
+        };
+
+        if (!this.continuous) {
           recognizer.stopContinuousRecognitionAsync();
         }
       } else if (recognizing) {
@@ -442,6 +452,8 @@ export default ({
         }
       }

+      onAudibleChunk = null;
+
       if (speechStarted) {
         this.dispatchEvent(new SpeechRecognitionEvent('speechend'));
       }
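Note that attach is now patched once at ponyfill scope rather than per recognizer, so each session has to manage the shared hooks itself: muted is reset when recognition starts, and onAudibleChunk is cleared here during teardown so a chunk read later cannot push into the queue of a finished session. Condensed from the hunks above (illustrative, not the full source):

    let onAudibleChunk; // shared across all sessions
    let muted;

    // On start:
    muted = false;
    onAudibleChunk = () => {
      queue.push({ firstAudibleChunk: {} });
      onAudibleChunk = null; // one-shot: only the first audible chunk matters
    };

    // On stop() without abort: mute so the patched reader emits { isEnd: true }.
    muted = true;

    // On teardown:
    onAudibleChunk = null;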
@@ -715,6 +715,7 @@ describe('SpeechRecognition', () => {
       // webspeech:speechend
       // webspeech:soundend
       // webspeech:audioend
+      // webspeech:error { error: 'no-speech' }
       // webspeech:end

       await endEventEmitted;
@@ -765,7 +766,7 @@ describe('SpeechRecognition', () => {
     expect(toSnapshot(events)).toMatchSnapshot();
   });

-  test('stop after recognized 2 speeches', async () => {
+  test('stop after recognized 1 speech and 1 ongoing', async () => {
     speechRecognition.start();
     speechRecognition.continuous = true;
     speechRecognition.interimResults = true;
@@ -796,15 +797,15 @@ describe('SpeechRecognition', () => {
     // cognitiveservices:recognized
     // webspeech:result ['Hello.' (isFinal)]

-    speechRecognition.stop();
-
-    // cognitiveservices:stop
-
     recognizer.recognized(this, createRecognizedEvent('World.'));

     // cognitiveservices:recognized
     // webspeech:result ['Hello.' (isFinal), 'World.' (isFinal)]

+    speechRecognition.stop();
+
+    // cognitiveservices:stop
+
     recognizer.audioConfig.emitEvent('AudioSourceOffEvent');

     // cognitiveservices:audioSourceOff
