# Model Initialization & Loading

In [None]:
import time
import sounddevice as sd
from GPT_SoVITS_RT.TTS import TTS
from tools import AudioStreamer, SubtitlesQueue


# Build the TTS engine; flash-attention is explicitly turned off here.
tts = TTS(use_flash_attn=False)

# Initialise the language module for Japanese ('ja' is also the language
# declared for the prompt audio below).
tts.init_language_module('ja')

# Both loaders are called without arguments, so whatever defaults the TTS
# class defines are used.
tts.load_gpt_model()

tts.load_sovits_model()

# Raw string literals: "\l" and "\A" are not valid escape sequences, so the
# non-raw form triggers a SyntaxWarning on Python 3.12+ (and is slated to
# become an error). The resulting path text is unchanged.
# NOTE(review): backslash-separated paths only resolve on Windows.
tts.cache_spk_audio(r"examples\laffey.mp3")

# Presumably pre-processes the prompt clip so later infer calls can reuse
# it -- confirm against TTS.cache_prompt_audio.
tts.cache_prompt_audio({
    "audio": r"examples\AnAn.ogg",
    "language": "ja",
    "text": "ちが……ちがう。レイア、貴様は間違っている。",
})

# Shared by the inference cells below to publish subtitles.
subtitlesqueue = SubtitlesQueue()

# Basic Text-to-Speech

In [None]:
# Interactive loop: read a line, synthesise it, report the speed, then play
# the result. An empty line ends the loop.
while True:
    text = input("infer text: ")
    if text == "":
        break

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()

    # Raw strings keep the Windows-style paths byte-identical while avoiding
    # the invalid "\l" / "\A" escape sequences.
    res = tts.infer(
        spk_audio_path=r"examples\laffey.mp3",
        prompt_audio_path=r"examples\AnAn.ogg",
        prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
        prompt_audio_language="ja",
        text=text,
        text_language="auto",
    )

    # Ratio of synthesis time to the duration of the generated audio.
    audio_len_s = res["audio_len_s"]
    print(f"RTF: {(time.perf_counter() - started) / audio_len_s:.2f}")

    subtitlesqueue.add(res["subtitles"], text)

    # blocking=True: do not prompt for the next line until playback is done.
    sd.play(res["audio_data"], res["samplerate"], blocking=True)

    # Presumably signals the end of this utterance's subtitles -- confirm
    # against SubtitlesQueue.add.
    subtitlesqueue.add(None, None)

    print()


# Voice Conversion

In [None]:
# Run voice conversion with the speaker and prompt clips, then play it.
# Raw strings keep the Windows-style paths byte-identical while avoiding the
# invalid "\l" / "\A" escape sequences (SyntaxWarning on Python 3.12+).
res = tts.infer_vc(
    spk_audio_path=r"examples\laffey.mp3",
    prompt_audio_path=r"examples\AnAn.ogg",
    prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
    prompt_audio_language="ja",
)

# Dump the whole result for inspection before playing it back.
print(res)
sd.play(res["audio_data"], res["samplerate"], blocking=True)

# Streaming Inference

In [None]:
# Streaming playback: synthesised chunks are handed to `streamer`, whose
# callback feeds the sounddevice output stream.
streamer = AudioStreamer()

# Entering the `with` block starts the stream; leaving it (normally, or via
# an exception / KeyboardInterrupt) stops and closes it, so the audio device
# is always released.
with sd.OutputStream(
    samplerate=32000,
    channels=1,
    callback=streamer.callback,
    dtype='float32',
) as stream:
    # An empty line ends the loop.
    while True:
        text = input("infer text: ")
        if text == "":
            break

        # Raw strings keep the Windows-style paths byte-identical while
        # avoiding the invalid "\l" / "\A" escape sequences.
        generator = tts.infer_stream(
            spk_audio_path=r"examples\laffey.mp3",
            prompt_audio_path=r"examples\AnAn.ogg",
            prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
            prompt_audio_language="ja",
            text=text,
            text_language="auto",
            boost_first_chunk=True, # If True, reduces initial latency but may introduce noise in short audio; set to False for better stability.
            debug=False,
        )

        # perf_counter() is monotonic, which suits short latency measurements.
        started = time.perf_counter()
        first_chunk = True

        for audio_data in generator:
            if first_chunk:
                first_chunk = False
                # Latency until the first chunk arrives, in milliseconds.
                print(f"TTFT: {int((time.perf_counter() - started) * 1000)}ms")

            subtitlesqueue.add(audio_data["new_subtitles"], text)
            streamer.put(audio_data["audio_data"])

        # Poll every 100 ms until the streamer's queue and buffer are both
        # empty before accepting the next line.
        while not streamer.q.empty() or len(streamer.buffer) > 0:
            sd.sleep(100)

        # Presumably signals the end of this utterance's subtitles --
        # confirm against SubtitlesQueue.add.
        subtitlesqueue.add(None, None)

        print()