Skip to content

Commit

Permalink
feat: Implement sentence-based TTS streaming in interact mode
Browse files Browse the repository at this point in the history
- Add sentence-utils.js with naive sentence completion algorithm
- Modify readResponseStream to process sentences in real-time
- Integrate TTS fetching for completed sentences
- Utilize audio queue to prevent overlapping playback

This commit introduces a significant improvement to the interact mode,
enabling faster voice-based interactions by processing and sending
completed sentences to the OpenAI TTS endpoint as they become available
in the streamed response.

Details:
- Implement getCompleteSentences function in sentence-utils.js
- Update readResponseStream to use sentence-based processing
- Add logic to track and process only new sentences
- Integrate with existing audio queue for smooth playback
- Handle potential errors in TTS fetching

Performance impact:
- Reduces latency in voice responses during interact mode
  • Loading branch information
o-stahl committed Jun 23, 2024
1 parent 8f2d16c commit 28d3b69
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 18 deletions.
60 changes: 46 additions & 14 deletions src/libs/api-access/gpt-api-access.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { showToast, sleep, parseStreamResponseChunk, handleTextStreamEnd } from '../utils/general-utils';
import { updateUI } from '../utils/general-utils';
import { playAudio } from '../utils/audio-utils';

import { whisperTemperature, audioSpeed, ttsModel, ttsVoice, messages, pushToTalkMode } from '../state-management/state';
import { getCompleteSentences } from '../utils/sentence-utils';
import { whisperTemperature, audioSpeed, ttsModel, ttsVoice, messages, isInteractModeOpen } from '../state-management/state';

import { addMessage } from '../conversation-management/message-processing';
const MAX_RETRY_ATTEMPTS = 5;
Expand Down Expand Up @@ -87,8 +87,6 @@ export async function fetchGPTResponseStream(
streamedMessageText,
autoScrollToBottom = true
) {
//const gptMessagesOnly = filterGPTMessages(conversation);

let tempMessages = conversation.map((message) => ({
role: message.role,
content: message.content,
Expand All @@ -108,13 +106,10 @@ export async function fetchGPTResponseStream(
}),
signal: abortController.signal,
};
let result;

try {
const response = await fetch('https://api.openai.com/v1/chat/completions', requestOptions);

result = await readResponseStream(response, updateUiFunction, autoScrollToBottom);

const result = await readResponseStream(response, updateUiFunction, autoScrollToBottom);
return result;
} catch (error) {
if (error.name === 'AbortError') {
Expand All @@ -132,6 +127,8 @@ async function readResponseStream(response, updateUiFunction, autoScrollToBottom
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let decodedResult = '';
let buffer = '';
let processedSentences = new Set();

while (true) {
const { done, value } = await reader.read();
Expand All @@ -148,13 +145,44 @@ async function readResponseStream(response, updateUiFunction, autoScrollToBottom
],
} of parsedLines) {
if (content) {
buffer += content;
decodedResult += content;

updateUI(content, messages.value, addMessage, autoScrollToBottom);

if (isInteractModeOpen.value) {
const sentences = getCompleteSentences(buffer);

for (const sentence of sentences) {
if (!processedSentences.has(sentence) && sentence.length <= 4096) {
processedSentences.add(sentence);
try {
await fetchTTSResponse(sentence);
} catch (error) {
console.error('Error fetching TTS response:', error);
}
}
}
buffer = buffer.slice(sentences.join('').length);
}
}
}
}

handleTextStreamEnd(decodedResult);
// Process any remaining content in the buffer
if (buffer.length > 0 && isInteractModeOpen.value) {
const sentences = getCompleteSentences(buffer);
for (const sentence of sentences) {
if (!processedSentences.has(sentence) && sentence.length <= 4096) {
processedSentences.add(sentence);
try {
await fetchTTSResponse(sentence);
} catch (error) {
console.error('Error fetching TTS response:', error);
}
}
}
}

return decodedResult;
}
Expand All @@ -168,15 +196,20 @@ export async function fetchTTSResponse(text) {
}

try {
if (text.length > 4096) {
console.error(`[TTS]: Input text exceeds 4096 characters.`);
return;
}

const response = await fetch('https://api.openai.com/v1/audio/speech', {
method: 'POST',
headers: {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json'
},
body: JSON.stringify({
model: ttsModel.value, // Adding the model parameter as required
input: text, // Changing 'text' to 'input' as required
model: ttsModel.value,
input: text,
voice: ttsVoice.value,
speed: audioSpeed.value
})
Expand All @@ -187,9 +220,8 @@ export async function fetchTTSResponse(text) {
throw new Error(`Error from TTS API: ${errorText}`);
}

const audioBlob = await response.blob(); // Get the audio content as a blob
console.log(`[TTS]: Received audio blob, length: ${audioBlob.size}`);
playAudio(audioBlob); // Ensure this function is called
const audioBlob = await response.blob();
playAudio(audioBlob);
} catch (error) {
console.error(`[TTS]: Error fetching TTS response: ${error.message}`);
}
Expand Down
8 changes: 4 additions & 4 deletions src/libs/utils/general-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -252,12 +252,12 @@ export function swipedRight(event) {
export async function handleTextStreamEnd(message) {
if (isInteractModeOpen.value) {
try {
// Call the fetchTTSResponse with "message" and play the result
await fetchTTSResponse(message);
if (message.length <= 4096) {
await fetchTTSResponse(message);
}
} catch (error) {
console.error('Error with TTS Response:', error);
}
finally {
} finally {
if (pushToTalkMode.value) {
isInteractModeOpen.value = false;
}
Expand Down
35 changes: 35 additions & 0 deletions src/libs/utils/sentence-utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
 * Extract the complete sentences from a streaming text buffer.
 *
 * A sentence is considered complete when a terminator (. ! ?) appears
 * outside of double quotes and outside any bracket pair, and is either
 * the last character of the buffer or followed by whitespace (so that
 * decimals like "3.14" are not split). Each extracted sentence is
 * whitespace-trimmed; any trailing partial sentence is not returned.
 *
 * @param {string} buffer - Accumulated streamed text.
 * @returns {string[]} Trimmed complete sentences, in order of appearance.
 */
export function getCompleteSentences(buffer) {
  const complete = [];
  const terminators = new Set(['.', '!', '?']);
  const openers = new Set(['(', '[', '{']);
  const closers = new Set([')', ']', '}']);

  let pending = '';
  let insideQuotes = false;
  let openBrackets = 0;

  for (let idx = 0; idx < buffer.length; idx++) {
    const ch = buffer[idx];
    pending += ch;

    // Track quoting and bracket nesting so terminators inside them are ignored.
    if (ch === '"') {
      insideQuotes = !insideQuotes;
    } else if (openers.has(ch)) {
      openBrackets += 1;
    } else if (closers.has(ch)) {
      // Clamp at zero so a stray closer cannot push the depth negative.
      openBrackets = Math.max(0, openBrackets - 1);
    }

    const atBufferEnd = idx === buffer.length - 1;
    const nextIsWhitespace = !atBufferEnd && /\s/.test(buffer[idx + 1]);

    if (
      terminators.has(ch) &&
      !insideQuotes &&
      openBrackets === 0 &&
      (atBufferEnd || nextIsWhitespace)
    ) {
      complete.push(pending.trim());
      pending = '';
    }
  }

  return complete;
}

0 comments on commit 28d3b69

Please sign in to comment.