# Model Initialization & Loading

In [1]:
import time
import sounddevice as sd
from GPT_SoVITS_RT.TTS import TTS
from tools import AudioStreamer, SubtitlesQueue


# Build the TTS engine with flash attention disabled, then load the language
# module and both models using the library's default arguments.
tts = TTS(use_flash_attn=False)

tts.init_language_module('ja')

tts.load_gpt_model()

tts.load_sovits_model()

# Raw strings keep the backslash-separated paths byte-identical at runtime
# while avoiding the invalid escape sequences "\l" and "\A", which emit a
# warning on current Python and are scheduled to become a syntax error.
tts.cache_spk_audio(r"examples\laffey.mp3")

# Pre-cache the prompt (reference) audio together with its language and text.
tts.cache_prompt_audio({
    "audio": r"examples\AnAn.ogg",
    "language": "ja",
    "text": "ちが……ちがう。レイア、貴様は間違っている。",
})

# Subtitle queue used by the inference cells below.
subtitlesqueue = SubtitlesQueue()

2026-01-10 20:11:14,734 - TTS.py - INFO: Device: cuda:0
2026-01-10 20:11:14,734 - TTS.py - INFO: Half: True, dtype: torch.float16
2026-01-10 20:11:14,752 - TTS.py - INFO: Loaded language module: ja
2026-01-10 20:11:15,574 - TTS.py - INFO: Loaded GPT model: pretrained_models/s1v3.ckpt
2026-01-10 20:11:17,919 - TTS.py - INFO: Loaded SoVITS model: pretrained_models/v2Pro/s2Gv2ProPlus.pth
2026-01-10 20:11:19,357 - TTS.py - INFO: Cached speaker audio: examples\laffey.mp3
2026-01-10 20:11:21,881 - TTS.py - INFO: Cached prompt audio: examples\AnAn.ogg


# Basic Text-to-Speech

In [2]:
# Interactive loop: synthesize each entered line, report the real-time factor
# (inference time / audio duration), then play the audio. An empty line exits.
while True:
    text = input("infer text: ")
    if not text:
        break

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()

    # Raw strings avoid the invalid "\l" / "\A" escape sequences while keeping
    # the path values identical to the ones used when caching.
    res = tts.infer(
        spk_audio_path=r"examples\laffey.mp3",
        prompt_audio_path=r"examples\AnAn.ogg",
        prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
        prompt_audio_language="ja",
        text=text,
        text_language="auto",
    )
    elapsed = time.perf_counter() - started

    audio_len_s = res["audio_len_s"]
    if audio_len_s > 0:
        print(f"RTF: {elapsed / audio_len_s:.2f}")
    else:
        # Zero-length output would otherwise raise ZeroDivisionError.
        print("RTF: n/a (no audio generated)")

    subtitlesqueue.add(res["subtitles"], text)

    # Blocking playback: the loop resumes only after the audio has finished.
    sd.play(res["audio_data"], res["samplerate"], blocking=True)

    # NOTE(review): (None, None) presumably signals end-of-utterance to the
    # subtitle queue - confirm against tools.SubtitlesQueue.
    subtitlesqueue.add(None, None)

    print()


# Voice Conversion

In [3]:
# Run voice conversion on the prompt audio using the cached speaker audio.
# Raw strings avoid the invalid "\l" / "\A" escape sequences while keeping the
# path values byte-identical at runtime.
res = tts.infer_vc(
    spk_audio_path=r"examples\laffey.mp3",
    prompt_audio_path=r"examples\AnAn.ogg",
    prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
    prompt_audio_language="ja",
)

# The result dict carries 'audio_data', 'samplerate', 'audio_len_s' and
# 'subtitles' (see the recorded output of this cell).
print(res)
sd.play(res["audio_data"], res["samplerate"], blocking=True)

2026-01-10 20:11:24,587 - TTS.py - INFO: Starting VC inference. Prompt audio: examples\AnAn.ogg
2026-01-10 20:11:24,589 - TTS.py - INFO: Using SoVITS model: pretrained_models/v2Pro/s2Gv2ProPlus.pth
2026-01-10 20:11:25,836 - TTS.py - INFO: VC Inference complete. Generated 7.28s of audio.


{'audio_data': array([ 2.0563602e-05, -2.0861626e-06, -7.9870224e-06, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
      shape=(232960,), dtype=float32), 'samplerate': 32000, 'audio_len_s': 7.28, 'subtitles': [{'text': 'ち', 'start_s': 0, 'end_s': 0.4, 'orig_idx_start': 0, 'orig_idx_end': 1}, {'text': 'が', 'start_s': 0.4, 'end_s': 0.66, 'orig_idx_start': 1, 'orig_idx_end': 2}, {'text': '…', 'start_s': 0.66, 'end_s': 1.6600000000000001, 'orig_idx_start': 3, 'orig_idx_end': 4}, {'text': 'ち', 'start_s': 1.6600000000000001, 'end_s': 2.02, 'orig_idx_start': 4, 'orig_idx_end': 5}, {'text': 'が', 'start_s': 2.02, 'end_s': 2.2600000000000002, 'orig_idx_start': 5, 'orig_idx_end': 6}, {'text': 'う', 'start_s': 2.2600000000000002, 'end_s': 2.46, 'orig_idx_start': 6, 'orig_idx_end': 7}, {'text': '。', 'start_s': 2.46, 'end_s': 3.08, 'orig_idx_start': 7, 'orig_idx_end': 8}, {'text': 'レ', 'start_s': 3.08, 'end_s': 3.3200000000000003, 'orig_idx_start': 8, 'orig_idx_end': 9}, {'text': 'イ',

# Streaming Inference

In [None]:
# Streaming playback: audio chunks produced by tts.infer_stream are handed to
# an AudioStreamer whose callback feeds a sounddevice output stream.
streamer = AudioStreamer()

stream = sd.OutputStream(
    samplerate=32000,
    channels=1,
    callback=streamer.callback,
    dtype='float32'
)
stream.start()

# try/finally guarantees the audio device is released even when the loop is
# interrupted (e.g. KeyboardInterrupt during input()) or inference raises.
try:
    while True:
        text = input("infer text: ")
        if not text:
            break

        # Raw strings avoid the invalid "\l" / "\A" escape sequences while
        # keeping the path values byte-identical at runtime.
        generator = tts.infer_stream(
            spk_audio_path=r"examples\laffey.mp3",
            prompt_audio_path=r"examples\AnAn.ogg",
            prompt_audio_text="ちが……ちがう。レイア、貴様は間違っている。",
            prompt_audio_language="ja",
            text=text,
            text_language="auto",
            boost_first_chunk=True, # If True, reduces initial latency but may introduce noise in short audio; set to False for better stability.
            debug=False,
        )

        # Monotonic clock for the time-to-first-chunk measurement.
        started = time.perf_counter()
        awaiting_first_chunk = True

        for audio_data in generator:
            if awaiting_first_chunk:
                awaiting_first_chunk = False
                print(f"TTFT: {int((time.perf_counter() - started) * 1000)}ms")

            subtitlesqueue.add(audio_data["new_subtitles"], text)
            streamer.put(audio_data["audio_data"])

        # Wait until everything queued for playback has been consumed before
        # prompting for the next line.
        while not streamer.q.empty() or len(streamer.buffer) > 0:
            sd.sleep(100)

        # NOTE(review): (None, None) presumably signals end-of-utterance to
        # the subtitle queue - confirm against tools.SubtitlesQueue.
        subtitlesqueue.add(None, None)

        print()
finally:
    stream.stop()
    stream.close()