Supports TTS (#32)

conor-f · Sep 25, 2023 · a51fcde · a51fcde
1 parent 04048de
commit a51fcde
Show file tree

Hide file tree

Showing 7 changed files with 300 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -144,3 +144,4 @@ cython_debug/
 
 # General personal scripts
 scripts/
+*.mp3
diff --git a/fia_api/settings.py b/fia_api/settings.py
@@ -66,6 +66,7 @@ class Settings(BaseSettings):
     jwt_refresh_secret_key: str = "jwt_refresh_secret_key"
 
     openai_api_key: str = "INVALID_OPENAI_API_KEY"
+    google_cloud_api_key_path: str = "INVALID_GOOGLE_CLOUD_API_KEY_PATH"
 
     get_learning_moments_prompt: str = """You are an expert German language teacher who works with native English speakers to help them learn German.  You analyze messages they send you and you find any mistakes they have made in them. You pay particular attention to grammar mistakes. If the user uses the English language in their sentence, help them by translating it into German. You are focusing on a colloquial spoken German style, and not formal written German."""
     conversation_continuation_prompt: str = """You are a native German speaker who is helping an English speaker learn to speak German. They are a beginner and want to try have a conversation only in German with you.  Sometimes they make spelling/grammar mistakes, but you always try to continue the conversation with them. You are friendly and ask questions to direct the conversation to help the user learn. You are allowed to use English if the user asks you what a word means, and to explain difficult words, but you don't have to. Apart from that, you only respond in German. You speak at a very basic level so the user can understand you. Some things you can use to get a conversation started with a user include: - Asking how their day has been. - Talking to them about their hobbies. - Playing the game Twenty Questions. - Asking them if they want to pretend to order something in a cafe. - etc"""

diff --git a/fia_api/web/api/teacher/schema.py b/fia_api/web/api/teacher/schema.py
@@ -134,3 +134,10 @@ class ConverseResponse(BaseModel):
     learning_moments: LearningMoments
     input_message: str
     conversation_response: str
+
+
+class GetAudioRequest(BaseModel):
+    """Request body for the get-audio endpoint."""
+
+    # The text to convert to speech audio.
+    text: str
+    # TODO: Add features like language and speaker type?
diff --git a/fia_api/web/api/teacher/utils.py b/fia_api/web/api/teacher/utils.py
@@ -1,10 +1,12 @@
 # noqa: WPS462
 import json
+import os
 import uuid
 from typing import Any, Dict, List
 
 import openai
 from fastapi import UploadFile
+from google.cloud import texttospeech
 from loguru import logger
 
 from fia_api.db.models.conversation_model import (
@@ -304,9 +306,41 @@ async def get_text_from_audio(audio_file: UploadFile) -> str:
     :return: String text.
     """
     # TODO: Shouldn't have to do this dance with writing/reading the file!
+    #   This may be related to `with` not working with async...
     with open("/tmp/whatever.wav", "wb") as tmp_fh:  # noqa: S108
         tmp_fh.write(audio_file.file.read())
 
     with open("/tmp/whatever.wav", "rb") as in_fh:  # noqa: S108
         # TODO: Store the token usage too
         return openai.Audio.transcribe("whisper-1", in_fh)["text"]
+
+
+# TODO: Narrow the return annotation from Any (bytes, or an iterator of bytes).
+def get_audio_stream_from_text(text: str) -> Any:
+    """
+    Given some text, yield the synthesized speech audio as MP3.
+
+    This is a generator: it performs a single synthesis request and yields
+    the audio content of the response once, as one chunk.
+
+    Side effect: every call sets the GOOGLE_APPLICATION_CREDENTIALS
+    environment variable from settings.google_cloud_api_key_path.
+
+    :param text: String text to convert.
+    :yields: MP3-encoded audio content of the synthesis response.
+    """
+    # Set before the client is created; presumably the Google client library
+    # reads its credentials file path from this variable - TODO confirm.
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = settings.google_cloud_api_key_path
+
+    client = texttospeech.TextToSpeechClient()
+    synthesis_input = texttospeech.SynthesisInput(text=text)
+
+    # The voice is hard-coded: language code "de" with a neutral gender.
+    voice = texttospeech.VoiceSelectionParams(
+        language_code="de",
+        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
+    )
+
+    audio_config = texttospeech.AudioConfig(
+        audio_encoding=texttospeech.AudioEncoding.MP3,
+    )
+
+    response = client.synthesize_speech(
+        input=synthesis_input,
+        voice=voice,
+        audio_config=audio_config,
+    )
+
+    # The whole audio payload is yielded at once rather than in pieces.
+    yield response.audio_content
diff --git a/fia_api/web/api/teacher/views.py b/fia_api/web/api/teacher/views.py
@@ -1,8 +1,14 @@
 from fastapi import APIRouter, Depends, UploadFile
+from fastapi.responses import StreamingResponse
 
 from fia_api.db.models.user_model import UserModel
-from fia_api.web.api.teacher.schema import ConverseResponse, TeacherConverseRequest
+from fia_api.web.api.teacher.schema import (
+    ConverseResponse,
+    GetAudioRequest,
+    TeacherConverseRequest,
+)
 from fia_api.web.api.teacher.utils import (
+    get_audio_stream_from_text,
     get_response,
     get_text_from_audio,
     initialize_conversation,
@@ -55,6 +61,7 @@ async def converse_with_audio(
     # TODO: Should be the same endpoint as above.
     # TODO: For some reason POST vars and File uploads are a mess. Fix all of
     # this nonsense.
+    #   This is because the conversation_id is passed as a str, not a model.
     message = await get_text_from_audio(audio_file)
 
     if conversation_id == "new":
@@ -68,3 +75,23 @@ async def converse_with_audio(
         message,
         await UserModel.get(username=user.username),
     )
+
+
+@router.post("/get-audio")
+def get_audio(
+    audio_request: GetAudioRequest,
+    user: AuthenticatedUser = Depends(get_current_user),
+) -> StreamingResponse:
+    """
+    Given some text and metadata, return the audio as an MP3 stream.
+
+    The user parameter is not used in the body; presumably the dependency is
+    declared so that only authenticated users can call this endpoint.
+
+    :param audio_request: The details of the request.
+    :param user: The AuthenticatedUser making the request.
+    :returns: StreamingResponse with media type audio/mpeg.
+    """
+    audio_stream = get_audio_stream_from_text(audio_request.text)
+
+    return StreamingResponse(
+        audio_stream,
+        media_type="audio/mpeg",
+    )