Skip to content

Commit

Permalink
Supports TTS (#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
conor-f committed Sep 25, 2023
1 parent 04048de commit a51fcde
Show file tree
Hide file tree
Showing 7 changed files with 300 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,4 @@ cython_debug/

# General personal scripts
scripts/
*.mp3
1 change: 1 addition & 0 deletions fia_api/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class Settings(BaseSettings):
jwt_refresh_secret_key: str = "jwt_refresh_secret_key"

openai_api_key: str = "INVALID_OPENAI_API_KEY"
google_cloud_api_key_path: str = "INVALID_GOOGLE_CLOUD_API_KEY_PATH"

get_learning_moments_prompt: str = """You are an expert German language teacher who works with native English speakers to help them learn German. You analyze messages they send you and you find any mistakes they have made in them. You pay particular attention to grammar mistakes. If the user uses the English language in their sentence, help them by translating it into German. You are focusing on a colloquial spoken German style, and not formal written German."""
conversation_continuation_prompt: str = """You are a native German speaker who is helping an English speaker learn to speak German. They are a beginner and want to try have a conversation only in German with you. Sometimes they make spelling/grammar mistakes, but you always try to continue the conversation with them. You are friendly and ask questions to direct the conversation to help the user learn. You are allowed to use English if the user asks you what a word means, and to explain difficult words, but you don't have to. Apart from that, you only respond in German. You speak at a very basic level so the user can understand you. Some things you can use to get a conversation started with a user include: - Asking how their day has been. - Talking to them about their hobbies. - Playing the game Twenty Questions. - Asking them if they want to pretend to order something in a cafe. - etc"""
Expand Down
7 changes: 7 additions & 0 deletions fia_api/web/api/teacher/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,10 @@ class ConverseResponse(BaseModel):
learning_moments: LearningMoments
input_message: str
conversation_response: str


class GetAudioRequest(BaseModel):
    """Payload accepted by the get-audio endpoint."""

    # TODO: Add features like language and speaker type?
    text: str
34 changes: 34 additions & 0 deletions fia_api/web/api/teacher/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# noqa: WPS462
import json
import os
import uuid
from typing import Any, Dict, List

import openai
from fastapi import UploadFile
from google.cloud import texttospeech
from loguru import logger

from fia_api.db.models.conversation_model import (
Expand Down Expand Up @@ -304,9 +306,41 @@ async def get_text_from_audio(audio_file: UploadFile) -> str:
:return: String text.
"""
# TODO: Shouldn't have to do this dance with writing/reading the file!
# This may be related to with not working with async...
with open("/tmp/whatever.wav", "wb") as tmp_fh: # noqa: S108
tmp_fh.write(audio_file.file.read())

with open("/tmp/whatever.wav", "rb") as in_fh: # noqa: S108
# TODO: Store the token usage too
return openai.Audio.transcribe("whisper-1", in_fh)["text"]


# TODO: Make this bytes or whatever.
def get_audio_stream_from_text(text: str) -> Any:
    """
    Given some text, return a byte stream of the audio as MP3.

    The speech is synthesised eagerly, before anything is returned, so a
    failure (bad credentials, API error) is raised to the caller directly
    rather than only once the returned stream is first iterated.

    :param text: String text to convert.
    :return: Iterator producing the MP3 audio content as one bytes chunk.
    """
    # Hand the service-account key file to the client explicitly instead of
    # rewriting the process-wide GOOGLE_APPLICATION_CREDENTIALS variable on
    # every call.
    client = texttospeech.TextToSpeechClient.from_service_account_file(
        settings.google_cloud_api_key_path,
    )

    response = client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="de",
            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    # Callers consume this as an iterable of byte chunks; the whole clip is
    # delivered as a single chunk, as before.
    return iter((response.audio_content,))
29 changes: 28 additions & 1 deletion fia_api/web/api/teacher/views.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from fastapi import APIRouter, Depends, UploadFile
from fastapi.responses import StreamingResponse

from fia_api.db.models.user_model import UserModel
from fia_api.web.api.teacher.schema import ConverseResponse, TeacherConverseRequest
from fia_api.web.api.teacher.schema import (
ConverseResponse,
GetAudioRequest,
TeacherConverseRequest,
)
from fia_api.web.api.teacher.utils import (
get_audio_stream_from_text,
get_response,
get_text_from_audio,
initialize_conversation,
Expand Down Expand Up @@ -55,6 +61,7 @@ async def converse_with_audio(
# TODO: Should be the same endpoint as above.
# TODO: For some reason POST vars and File uploads are a mess. Fix all of
# this nonsense.
# This is because the conversation_id is passed as a str, not a model.
message = await get_text_from_audio(audio_file)

if conversation_id == "new":
Expand All @@ -68,3 +75,23 @@ async def converse_with_audio(
message,
await UserModel.get(username=user.username),
)


@router.post("/get-audio")
def get_audio(
    audio_request: GetAudioRequest,
    user: AuthenticatedUser = Depends(get_current_user),
) -> StreamingResponse:
    """
    Convert the requested text to speech and stream it back as MP3.

    :param audio_request: Request body carrying the text to synthesise.
    :param user: The AuthenticatedUser making the request.
    :returns: StreamingResponse with media type audio/mpeg.
    """
    return StreamingResponse(
        get_audio_stream_from_text(audio_request.text),
        media_type="audio/mpeg",
    )
Loading

0 comments on commit a51fcde

Please sign in to comment.