feat: Implement echo cancellation and improve audio handling
- Add echo cancellation to reduce false triggers from TTS output
- Separate input (microphone) and output (TTS) audio paths
- Refactor audio utilities for better modularity and reusability

Changes:
- Add setupEchoCancellation function in audio-utils.js
- Implement getProcessedAudioStream for echo-cancelled audio
- Modify startRecording in InteractMode.vue to use echo cancellation
- Update playAudio and addToAudioQueue for improved audio management
- Ensure AudioContext persistence across component lifecycle

This commit aims to improve the voice interaction experience by
reducing self-triggering and enhancing audio playback control.
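
A minimal sketch of how the two new helpers are meant to be wired together on the capture side, assembled from the InteractMode.vue changes below (component refs and error handling omitted):

// Route the microphone through the echo-cancellation path before analysis and recognition.
const { audioContext, echoCanceller, stream } = await setupEchoCancellation();
const processedStream = getProcessedAudioStream();

// The waveform analyser reads from the processed stream, not the raw microphone stream.
const analyser = audioContext.createAnalyser();
audioContext.createMediaStreamSource(processedStream).connect(analyser);

// The same processed stream feeds the MediaRecorder, while TTS playback
// (playAudio / addToAudioQueue) stays on the output side of the shared AudioContext,
// keeping input and output paths separate per the commit's intent.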
o-stahl committed Jun 23, 2024
1 parent 28d3b69 commit c62835d
Showing 2 changed files with 98 additions and 62 deletions.
95 changes: 49 additions & 46 deletions src/components/controls/InteractMode.vue
@@ -29,23 +29,32 @@
import { ref, onMounted, onUnmounted, nextTick } from 'vue';
import { fetchSTTResponse } from '@/libs/api-access/gpt-api-access';
import { useWhisper, isInteractModeOpen } from '@/libs/state-management/state';
import { getSupportedMimeType, checkMicrophoneAvailability, checkWebSpeechAPI, startMediaRecorder, playAudio, downloadAudio } from '@/libs/utils/audio-utils';
import {
getSupportedMimeType,
checkMicrophoneAvailability,
checkWebSpeechAPI,
startMediaRecorder,
playAudio,
downloadAudio,
setupEchoCancellation,
getProcessedAudioStream
} from '@/libs/utils/audio-utils';
const emit = defineEmits(['recognized-sentence', 'close-interact-mode']);
const isRecording = ref(false);
const isLoading = ref(false);
const errorMessage = ref(null);
const audioContext = ref(null);
const analyser = ref(null);
const dataArray = ref(null);
const mediaRecorder = ref(null);
const recognizedSentences = ref([]);
let currentAudioChunks = [];
let vadStream = null;
const wavePath = ref('');
const state = ref('listening');
const mimeType = getSupportedMimeType();
const recognition = ref(null);
let audioContextRef = null;
const stopRecording = () => {
if (mediaRecorder.value && mediaRecorder.value.state !== 'inactive') {
@@ -56,11 +65,6 @@ const stopRecording = () => {
recognition.value.stop();
}
if (vadStream) {
vadStream.getTracks().forEach(track => track.stop());
vadStream = null;
}
isRecording.value = false;
};
@@ -143,18 +147,25 @@ const startRecording = async () => {
}
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
audioContext.value = new (window.AudioContext || window.webkitAudioContext)();
analyser.value = audioContext.value.createAnalyser();
const source = audioContext.value.createMediaStreamSource(stream);
const { audioContext, echoCanceller, stream } = await setupEchoCancellation();
audioContextRef = audioContext;
const processedStream = getProcessedAudioStream();
analyser.value = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(processedStream);
source.connect(analyser.value);
analyser.value.fftSize = 2048;
const bufferLength = analyser.value.frequencyBinCount;
dataArray.value = new Uint8Array(bufferLength);
vadStream = stream;
dataArray.value = new Uint8Array(analyser.value.frequencyBinCount);
isRecording.value = true;
recognition.value = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
recognition.value.continuous = true;
recognition.value.interimResults = false;
recognition.value.lang = 'en-US';
setupRecognitionEventListeners(processedStream);
recognition.value.start();
state.value = 'listening';
@@ -163,23 +174,14 @@ const startRecording = async () => {
isLoading.value = false;
} catch (error) {
console.error('Error in startRecording:', error);
errorMessage.value = 'Error starting recording or speech recognition.';
state.value = 'error';
isLoading.value = false;
}
};
const recognition = ref(null);
onMounted(async () => {
if (!checkWebSpeechAPI()) {
errorMessage.value = 'Web Speech API is not supported in this browser.';
state.value = 'error';
return;
}
recognition.value = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
const setupRecognitionEventListeners = (stream) => {
recognition.value.onspeechstart = () => {
drawAudioWaveform();
console.log("Speech has been detected");
@@ -188,9 +190,9 @@ onMounted(async () => {
return;
}
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
startMediaRecorder(stream, mediaRecorder, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.start();
if (!mediaRecorder.value || mediaRecorder.value.state === 'inactive') {
mediaRecorder.value = startMediaRecorder(stream, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.value.start();
}
};
@@ -202,13 +204,11 @@ onMounted(async () => {
return;
}
mediaRecorder.stop();
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
mediaRecorder.value.stop();
}
});
recognition.value.continuous = true;
recognition.value.interimResults = false;
recognition.value.lang = 'en-US';
recognition.value.onresult = async (event) => {
const transcript = event.results[event.resultIndex][0].transcript.trim();
if (transcript.length > 0) {
@@ -217,7 +217,7 @@ onMounted(async () => {
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
mediaRecorder.value.stop();
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for the mediaRecorder to finish processing the last chunk
await new Promise((resolve) => setTimeout(resolve, 500));
}
if (useWhisper.value) {
@@ -245,20 +245,17 @@ onMounted(async () => {
recognition.value.onerror = (event) => {
if (event.error == "no-speech") {
return; //This prevents the error message for no speech from popping up repeatedly.
//Maybe consider a counter here in the future for if no speech is is thrown more than X times we disable interact mode.
return;
}
errorMessage.value = `Speech recognition error: ${event.error}`;
isLoading.value = false;
state.value = 'error';
if (event.error === 'aborted') {
if (isRecording.value) {
setTimeout(() => {
recognition.value.start();
}, 1000);
}
if (event.error === 'aborted' && isRecording.value) {
setTimeout(() => {
recognition.value.start();
}, 1000);
}
};
@@ -269,18 +266,24 @@ onMounted(async () => {
}, 1000);
}
};
};
onMounted(async () => {
if (!checkWebSpeechAPI()) {
errorMessage.value = 'Web Speech API is not supported in this browser.';
state.value = 'error';
return;
}
await startRecording();
});
onUnmounted(() => {
if (audioContext.value) {
audioContext.value.close();
}
if (recognition.value) {
recognition.value.stop();
}
stopRecording();
// Don't close the AudioContext here
});
</script>

65 changes: 49 additions & 16 deletions src/libs/utils/audio-utils.js
@@ -1,43 +1,52 @@
// /libs/utils/audio-utils.js
// audio-utils.js
import { audioQueue, audioIsPlaying } from '@/libs/state-management/state';

const audioContext = new (window.AudioContext || window.webkitAudioContext)(); // Create a single AudioContext instance
let audioContext;
let microphone;
let echoCanceller;

const logDebug = (message) => {
console.log(`[Audio Utils]: ${message}`);
};

const ensureAudioContext = () => {
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
return audioContext;
};

const playNextAudio = () => {
if (audioQueue.value.length > 0) {
if (audioQueue.value.length > 0 && !audioIsPlaying.value) {
const audioBlob = audioQueue.value.shift();
logDebug(`Playing next audio. Queue length: ${audioQueue.value.length}`);

const reader = new FileReader();
reader.onload = () => {
audioContext.decodeAudioData(reader.result, (buffer) => {
ensureAudioContext().decodeAudioData(reader.result, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);

source.onended = () => {
logDebug(`Audio ended. Queue length: ${audioQueue.value.length}`);
audioIsPlaying.value = false; // Reset the flag when the audio ends
playNextAudio(); // Recursively play the next audio
audioIsPlaying.value = false;
playNextAudio();
};

source.start(0);
logDebug('Audio started playing.');
audioIsPlaying.value = true; // Set the flag when an audio starts playing
audioIsPlaying.value = true;
}, (error) => {
logDebug(`Error decoding audio: ${error.message}`);
audioIsPlaying.value = false; // Reset the flag on error
playNextAudio(); // Try to play the next audio
audioIsPlaying.value = false;
playNextAudio();
});
};

reader.readAsArrayBuffer(audioBlob);
} else {
logDebug('Audio queue is empty.');
logDebug('Audio queue is empty or audio is already playing.');
}
};

@@ -52,6 +61,29 @@ export const addToAudioQueue = (blob) => {
}
};

export const setupEchoCancellation = async () => {
audioContext = ensureAudioContext();

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
microphone = audioContext.createMediaStreamSource(stream);

echoCanceller = audioContext.createEchoModerator ? audioContext.createEchoModerator() : audioContext.createGain();

microphone.connect(echoCanceller);
// Do not connect echoCanceller to audioContext.destination

return { audioContext, echoCanceller, stream };
};

export const getProcessedAudioStream = () => {
if (!echoCanceller) {
throw new Error('Echo cancellation not set up. Call setupEchoCancellation first.');
}
const processorNode = audioContext.createMediaStreamDestination();
echoCanceller.connect(processorNode);
return processorNode.stream;
};

export const playAudio = (blob) => {
logDebug('playAudio called.');
addToAudioQueue(blob);
@@ -109,25 +141,26 @@ export const checkWebSpeechAPI = () => {
return supported;
};

export const startMediaRecorder = async (stream, mediaRecorder, currentAudioChunks, mimeType, recognizedSentences) => {
mediaRecorder.value = new MediaRecorder(stream);
export const startMediaRecorder = (stream, currentAudioChunks, mimeType, recognizedSentences) => {
const mediaRecorder = new MediaRecorder(stream);

mediaRecorder.value.ondataavailable = (event) => {
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
currentAudioChunks.push(event.data);
logDebug('Audio data available.');
}
};

mediaRecorder.value.onstop = async () => {
mediaRecorder.onstop = () => {
const blob = new Blob(currentAudioChunks, { type: mimeType });
currentAudioChunks = [];
currentAudioChunks.length = 0;

if (recognizedSentences.value.length > 0) {
recognizedSentences.value[recognizedSentences.value.length - 1].blob = blob;
logDebug('MediaRecorder stopped. Audio blob created.');
}
};

logDebug('MediaRecorder started.');
logDebug('MediaRecorder created.');
return mediaRecorder;
};
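
A caller-side usage sketch for the reworked startMediaRecorder, mirroring the InteractMode.vue hunk above (variable names assumed from that component, not additional committed code):

// The utility now constructs and returns the recorder; the caller owns its lifecycle.
mediaRecorder.value = startMediaRecorder(processedStream, currentAudioChunks, mimeType, recognizedSentences);
mediaRecorder.value.start();

// Later, once a sentence has been recognized:
if (mediaRecorder.value && mediaRecorder.value.state === 'recording') {
  mediaRecorder.value.stop(); // onstop attaches the recorded blob to the last recognized sentence
}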
