In [1]:
# Verify GPU is available
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Ti
VRAM: 12.5 GB


In [2]:
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel
from IPython.display import Audio, display, Markdown
import os

# Create output directory
os.makedirs("audio_outputs", exist_ok=True)

# Load the model
print("Loading Qwen3-TTS model... (this may take a few minutes on first run)")

# Check if flash_attention_2 is available
try:
    import flash_attn
    attn_impl = "flash_attention_2"
    print("‚úÖ Using Flash Attention 2 for faster inference")
except ImportError:
    attn_impl = "eager"
    print("‚ö†Ô∏è Flash Attention not available, using standard attention")

model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation=attn_impl,
)

print("\n‚úÖ Model loaded successfully!")

  from .autonotebook import tqdm as notebook_tqdm
/bin/sh: 1: sox: not found
SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


Loading Qwen3-TTS model... (this may take a few minutes on first run)
‚úÖ Using Flash Attention 2 for faster inference


You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Fetching 4 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:57<00:00, 14.32s/it]



‚úÖ Model loaded successfully!


In [8]:
# Get supported speakers and languages
speakers = model.get_supported_speakers()
languages = model.get_supported_languages()

print("üé§ AVAILABLE SPEAKERS:")
print("=" * 60)
speaker_info = {
    "Vivian": ("Female", "Chinese", "Bright, slightly edgy young female voice"),
    "Serena": ("Female", "Chinese", "Warm, gentle young female voice"),
    "Uncle_Fu": ("Male", "Chinese", "Seasoned male voice with a low, mellow timbre"),
    "Dylan": ("Male", "Chinese (Beijing)", "Youthful Beijing male voice, clear, natural"),
    "Eric": ("Male", "Chinese (Sichuan)", "Lively Chengdu male voice, slightly husky"),
    "Ryan": ("Male", "English", "Dynamic male voice with strong drive"),
    "Aiden": ("Male", "English", "Sunny American male voice with clear midrange"),
    "Ono_Anna": ("Female", "Japanese", "Playful female voice with light, nimble timbre"),
    "Sohee": ("Female", "Korean", "Warm female voice with rich emotion"),
}

for speaker in speakers:
    if speaker in speaker_info:
        gender, native_lang, desc = speaker_info[speaker]
        print(f"\n  üìå {speaker}")
        print(f"     Gender: {gender} | Native: {native_lang}")
        print(f"     Description: {desc}")

print("\n" + "=" * 60)
print(f"\nüåç SUPPORTED LANGUAGES ({len(languages)}):")
print(", ".join(languages))

üé§ AVAILABLE SPEAKERS:


üåç SUPPORTED LANGUAGES (11):
auto, chinese, english, french, german, italian, japanese, korean, portuguese, russian, spanish


In [9]:
def generate_and_play(text, language, speaker, filename, instruct=None):
    """Helper function to generate audio and display it"""
    print(f"üéôÔ∏è Generating: \"{text[:50]}{'...' if len(text) > 50 else ''}\"")
    print(f"   Speaker: {speaker} | Language: {language}")
    if instruct:
        print(f"   Instruction: {instruct}")

    wavs, sr = model.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct if instruct else "",
    )

    filepath = f"audio_outputs/{filename}.wav"
    sf.write(filepath, wavs[0], sr)
    print(f"   ‚úÖ Saved to {filepath}")

    display(Audio(wavs[0], rate=sr))
    return wavs, sr

In [10]:
# Basic English TTS
generate_and_play(
    text="Hello! Welcome to the Qwen3 Text-to-Speech demonstration. This model can generate natural, expressive speech in multiple languages.",
    language="English",
    speaker="Ryan",
    filename="01_basic_english_ryan"
)

# Basic English TTS
generate_and_play(
    text="Hello! Welcome to the Qwen3 Text-to-Speech demonstration. This model can generate natural, expressive speech in multiple languages.",
    language="English",
    speaker="Aiden",
    filename="01_basic_english_aiden"
)

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


üéôÔ∏è Generating: "Hello! Welcome to the Qwen3 Text-to-Speech demonst..."
   Speaker: Ryan | Language: English
   ‚úÖ Saved to audio_outputs/01_basic_english_ryan.wav


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


üéôÔ∏è Generating: "Hello! Welcome to the Qwen3 Text-to-Speech demonst..."
   Speaker: Aiden | Language: English
   ‚úÖ Saved to audio_outputs/01_basic_english_aiden.wav


([array([ 2.0027161e-05,  1.8954277e-05, -8.9406967e-08, ...,
         -1.2934208e-05,  2.3841858e-05,  2.3961067e-05],
        shape=(259200,), dtype=float32)],
 24000)