feat: Implement echo cancellation and improve audio handling
- Add echo cancellation to reduce false triggers from TTS output
- Separate input (microphone) and output (TTS) audio paths
- Refactor audio utilities for better modularity and reusability

Changes:
- Add setupEchoCancellation function in audio-utils.js
- Implement getProcessedAudioStream for echo-cancelled audio
- Modify startRecording in InteractMode.vue to use echo cancellation
- Update playAudio and addToAudioQueue for improved audio management
- Ensure AudioContext persistence across component lifecycle

This commit aims to improve the voice interaction experience by
reducing self-triggering and enhancing audio playback control.
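
A minimal sketch of how the two new helpers are meant to be wired together on the capture side, assembled from the InteractMode.vue changes below (component refs and error handling omitted):

// Route the microphone through the echo-cancellation path before analysis and recognition.
const { audioContext, echoCanceller, stream } = await setupEchoCancellation();
const processedStream = getProcessedAudioStream();

// The waveform analyser reads from the processed stream, not the raw microphone stream.
const analyser = audioContext.createAnalyser();
audioContext.createMediaStreamSource(processedStream).connect(analyser);

// The same processed stream feeds the MediaRecorder, while TTS playback
// (playAudio / addToAudioQueue) stays on the output side of the shared AudioContext,
// keeping input and output paths separate per the commit's intent.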
o-stahl committed Jun 23, 2024
1 parent 28d3b69 commit c62835d
Showing 2 changed files with 98 additions and 62 deletions.
95 changes: 49 additions & 46 deletions src/components/controls/InteractMode.vue
@@ -29,23 +29,32 @@
import { ref, onMounted, onUnmounted, nextTick } from 'vue';
import { fetchSTTResponse } from '@/libs/api-access/gpt-api-access';
import { useWhisper, isInteractModeOpen } from '@/libs/state-management/state';
import { getSupportedMimeType, checkMicrophoneAvailability, checkWebSpeechAPI, startMediaRecorder, playAudio, downloadAudio } from '@/libs/utils/audio-utils';
import {
getSupportedMimeType,
checkMicrophoneAvailability,
checkWebSpeechAPI,
startMediaRecorder,
playAudio,
downloadAudio,
setupEchoCancellation,
getProcessedAudioStream
} from '@/libs/utils/audio-utils';
const emit = defineEmits(['recognized-sentence', 'close-interact-mode']);
const isRecording = ref(false);
const isLoading = ref(false);
const errorMessage = ref(null);
const audioContext = ref(null);
const analyser = ref(null);
const dataArray = ref(null);
const mediaRecorder = ref(null);
const recognizedSentences = ref([]);
let currentAudioChunks = [];
let vadStream = null;
const wavePath = ref('');
const state = ref('listening');
const mimeType = getSupportedMimeType();
const recognition = ref(null);
let audioContextRef = null;
const stopRecording = () => {
if (mediaRecorder.value && mediaRecorder.value.state !== 'inactive') {
@@ -56,11 +65,6 @@ const stopRecording = () => {
recognition.value.stop();
}
if (vadStream) {
vadStream.getTracks().forEach(track => track.stop());
vadStream = null;
}
isRecording.value = false;
};
@@ -143,18 +147,25 @@ const startRecording = async () => {
}
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
audioContext.value = new (window.AudioContext || window.webkitAudioContext)();
analyser.value = audioContext.value.createAnalyser();
const source = audioContext.value.createMediaStreamSource(stream);
const { audioContext, echoCanceller, stream } = await setupEchoCancellation();
audioContextRef = audioContext;
const processedStream = getProcessedAudioStream();
analyser.value = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(processedStream);
source.connect(analyser.value);
analyser.value.fftSize = 2048;
const bufferLength = analyser.value.frequencyBinCount;
dataArray.value = new Uint8Array(bufferLength);
vadStream = stream;
dataArray.value = new Uint8Array(analyser.value.frequencyBinCount);
isRecording.value = true;
recognition.value = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
recognition.value.continuous = true;
recognition.value.interimResults = false;
recognition.value.lang = 'en-US';
setupRecognitionEventListeners(processedStream);
recognition.value.start();
state.value = 'listening';
@@ -163,23 +174,14 @@ const startRecording = async () => {
isLoading.value = false;
} catch (error) {
console.error('Error in startRecording:', error);
errorMessage.value = 'Error starting recording or speech recognition.';
state.value = 'error';
isLoading.value = false;
}
};
const recognition = ref(null);
onMounted(async () => {
if (!checkWebSpeechAPI()) {
errorMessage.value = 'Web Speech API is not supported in this browser.';
state.value = 'error';
return;
}
recognition.value = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
const setupRecognitionEventListeners = (stream) => {
recognition.value.onspeechstart = () => {
drawAudioWaveform();
console.log("Speech has been detected");
@@ -188,9 +190,9 @@ onMounted(async () => {
return;
}
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
startMediaRecorder(stream, mediaRecorder, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.start();
if (!mediaRecorder.value || mediaRecorder.value.state === 'inactive') {
mediaRecorder.value = startMediaRecorder(stream, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.value.start();
}
};
@@ -202,13 +204,11 @@ onMounted(async () => {
return;
}
mediaRecorder.stop();
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
mediaRecorder.value.stop();
}
});
recognition.value.continuous = true;
recognition.value.interimResults = false;
recognition.value.lang = 'en-US';
recognition.value.onresult = async (event) => {
const transcript = event.results[event.resultIndex][0].transcript.trim();
if (transcript.length > 0) {
@@ -217,7 +217,7 @@ onMounted(async () => {
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
mediaRecorder.value.stop();
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for the mediaRecorder to finish processing the last chunk
await new Promise((resolve) => setTimeout(resolve, 500));
}
if (useWhisper.value) {
@@ -245,20 +245,17 @@ onMounted(async () => {
recognition.value.onerror = (event) => {
if (event.error == "no-speech") {
return; //This prevents the error message for no speech from popping up repeatedly.
//Maybe consider a counter here in the future for if no speech is is thrown more than X times we disable interact mode.
return;
}
errorMessage.value = `Speech recognition error: ${event.error}`;
isLoading.value = false;
state.value = 'error';
if (event.error === 'aborted') {
if (isRecording.value) {
setTimeout(() => {
recognition.value.start();
}, 1000);
}
if (event.error === 'aborted' && isRecording.value) {
setTimeout(() => {
recognition.value.start();
}, 1000);
}
};
@@ -269,18 +266,24 @@ onMounted(async () => {
}, 1000);
}
};
};
onMounted(async () => {
if (!checkWebSpeechAPI()) {
errorMessage.value = 'Web Speech API is not supported in this browser.';
state.value = 'error';
return;
}
await startRecording();
});
onUnmounted(() => {
if (audioContext.value) {
audioContext.value.close();
}
if (recognition.value) {
recognition.value.stop();
}
stopRecording();
// Don't close the AudioContext here
});
</script>

65 changes: 49 additions & 16 deletions src/libs/utils/audio-utils.js
@@ -1,43 +1,52 @@
// /libs/utils/audio-utils.js
// audio-utils.js
import { audioQueue, audioIsPlaying } from '@/libs/state-management/state';

const audioContext = new (window.AudioContext || window.webkitAudioContext)(); // Create a single AudioContext instance
let audioContext;
let microphone;
let echoCanceller;

const logDebug = (message) => {
console.log(`[Audio Utils]: ${message}`);
};

const ensureAudioContext = () => {
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
return audioContext;
};

const playNextAudio = () => {
if (audioQueue.value.length > 0) {
if (audioQueue.value.length > 0 && !audioIsPlaying.value) {
const audioBlob = audioQueue.value.shift();
logDebug(`Playing next audio. Queue length: ${audioQueue.value.length}`);

const reader = new FileReader();
reader.onload = () => {
audioContext.decodeAudioData(reader.result, (buffer) => {
ensureAudioContext().decodeAudioData(reader.result, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);

source.onended = () => {
logDebug(`Audio ended. Queue length: ${audioQueue.value.length}`);
audioIsPlaying.value = false; // Reset the flag when the audio ends
playNextAudio(); // Recursively play the next audio
audioIsPlaying.value = false;
playNextAudio();
};

source.start(0);
logDebug('Audio started playing.');
audioIsPlaying.value = true; // Set the flag when an audio starts playing
audioIsPlaying.value = true;
}, (error) => {
logDebug(`Error decoding audio: ${error.message}`);
audioIsPlaying.value = false; // Reset the flag on error
playNextAudio(); // Try to play the next audio
audioIsPlaying.value = false;
playNextAudio();
});
};

reader.readAsArrayBuffer(audioBlob);
} else {
logDebug('Audio queue is empty.');
logDebug('Audio queue is empty or audio is already playing.');
}
};

@@ -52,6 +61,29 @@ export const addToAudioQueue = (blob) => {
}
};

export const setupEchoCancellation = async () => {
audioContext = ensureAudioContext();

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
microphone = audioContext.createMediaStreamSource(stream);

echoCanceller = audioContext.createEchoModerator ? audioContext.createEchoModerator() : audioContext.createGain();

microphone.connect(echoCanceller);
// Do not connect echoCanceller to audioContext.destination

return { audioContext, echoCanceller, stream };
};

export const getProcessedAudioStream = () => {
if (!echoCanceller) {
throw new Error('Echo cancellation not set up. Call setupEchoCancellation first.');
}
const processorNode = audioContext.createMediaStreamDestination();
echoCanceller.connect(processorNode);
return processorNode.stream;
};

export const playAudio = (blob) => {
logDebug('playAudio called.');
addToAudioQueue(blob);
@@ -109,25 +141,26 @@ export const checkWebSpeechAPI = () => {
return supported;
};

export const startMediaRecorder = async (stream, mediaRecorder, currentAudioChunks, mimeType, recognizedSentences) => {
mediaRecorder.value = new MediaRecorder(stream);
export const startMediaRecorder = (stream, currentAudioChunks, mimeType, recognizedSentences) => {
const mediaRecorder = new MediaRecorder(stream);

mediaRecorder.value.ondataavailable = (event) => {
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
currentAudioChunks.push(event.data);
logDebug('Audio data available.');
}
};

mediaRecorder.value.onstop = async () => {
mediaRecorder.onstop = () => {
const blob = new Blob(currentAudioChunks, { type: mimeType });
currentAudioChunks = [];
currentAudioChunks.length = 0;

if (recognizedSentences.value.length > 0) {
recognizedSentences.value[recognizedSentences.value.length - 1].blob = blob;
logDebug('MediaRecorder stopped. Audio blob created.');
}
};

logDebug('MediaRecorder started.');
logDebug('MediaRecorder created.');
return mediaRecorder;
};
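
A caller-side usage sketch for the reworked startMediaRecorder, mirroring the InteractMode.vue hunk above (variable names assumed from that component, not additional committed code):

// The utility now constructs and returns the recorder; the caller owns its lifecycle.
mediaRecorder.value = startMediaRecorder(processedStream, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.value.start();

// Later, once a sentence has been recognized:
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
  mediaRecorder.value.stop(); // onstop attaches the recorded blob to the last recognized sentence
}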
