In [1]:
# Verify GPU and imports
import gc
import os

import soundfile as sf
import torch
from IPython.display import Audio, display, Markdown, HTML

# Report the PyTorch build and, when CUDA is usable, details of device 0
_cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {_vram_bytes / 1e9:.1f} GB")  # decimal gigabytes (bytes / 1e9)

# Folder that receives every generated clip
os.makedirs("audio_outputs", exist_ok=True)

# Choose the attention backend: the import is only a probe for whether the
# flash_attn package can be loaded; ATTN_IMPL is consumed by the model loaders.
try:
    import flash_attn
except ImportError:
    ATTN_IMPL = "eager"
    print("\n⚠️ Flash Attention not available, using standard attention")
else:
    ATTN_IMPL = "flash_attention_2"
    print("\n✅ Flash Attention 2 available")

PyTorch: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Ti
VRAM: 12.5 GB

✅ Flash Attention 2 available


In [2]:
# Helper function for saving and playing audio
def play_audio(wav, sr, filename=None, title=None):
    """Optionally save a waveform to disk, then show an inline audio player.

    Args:
        wav: Audio samples; passed unchanged to ``sf.write`` and ``Audio``.
        sr: Sample rate, used for both the written file and the player.
        filename: Base name without extension. When truthy, the clip is written
            to ``audio_outputs/<filename>.wav``; when falsy, nothing is saved.
        title: When truthy, rendered as bold Markdown above the player.
    """
    if filename:
        filepath = f"audio_outputs/{filename}.wav"
        # Create the target folder on demand so the helper does not depend on
        # another cell having made it (fresh working directory, skipped setup
        # cell, or a `filename` that contains a subfolder).
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        sf.write(filepath, wav, sr)
    if title:
        display(Markdown(f"**{title}**"))
    display(Audio(wav, rate=sr))

In [3]:
from qwen_tts import Qwen3TTSModel

# Load the VoiceDesign checkpoint on cuda:0 in bfloat16, using the attention
# backend selected during setup (ATTN_IMPL).
print("🎨 Loading VoiceDesign model...")
voice_design_load_options = dict(
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation=ATTN_IMPL,
)
voice_design_model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    **voice_design_load_options,
)
print("✅ VoiceDesign model loaded!")

  from .autonotebook import tqdm as notebook_tqdm
/bin/sh: 1: sox: not found
SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


🎨 Loading VoiceDesign model...


You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Fetching 4 files: 100%|██████████| 4/4 [00:42<00:00, 10.58s/it]


✅ VoiceDesign model loaded!


In [4]:
# Example 1: Cute anime-style voice
print("🎀 Generating: Cute anime girl voice\n")

# The spoken line and the free-text voice description are named separately so
# the generation call below stays short.
anime_line = "Hello everyone! I'm so excited to meet you all today! Let's have lots of fun together!"
anime_voice_description = (
    "Young female voice, around 16 years old, very cute and energetic. "
    "High-pitched with a bright, cheerful tone. "
    "Speaks with enthusiasm and excitement, like an anime character."
)

wavs, sr = voice_design_model.generate_voice_design(
    text=anime_line,
    language="English",
    instruct=anime_voice_description,
)
# Save the first returned waveform and play it inline
play_audio(wavs[0], sr, "vd_01_anime_girl", "🎀 Cute Anime Girl Voice")

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


🎀 Generating: Cute anime girl voice



**🎀 Cute Anime Girl Voice**

In [5]:
# Example 2: Deep authoritative narrator
print("📚 Generating: Documentary narrator voice\n")

# Collect the request first, then unpack it into the generation call
narrator_request = {
    "text": "In the depths of the ocean, where sunlight cannot reach, lies a world of extraordinary creatures.",
    "language": "English",
    "instruct": (
        "Male voice, 50-60 years old, deep baritone. "
        "Speaks slowly and deliberately with gravitas. "
        "Professional documentary narrator style, like David Attenborough. "
        "Calm, authoritative, and captivating."
    ),
}
wavs, sr = voice_design_model.generate_voice_design(**narrator_request)

# Save the first returned waveform and play it inline
play_audio(wavs[0], sr, "vd_02_narrator", "📚 Documentary Narrator")

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


📚 Generating: Documentary narrator voice



**📚 Documentary Narrator**

In [6]:
# Free up memory and load CustomVoice model
# Drop the VoiceDesign model if it is still bound. Guarding the `del` keeps this
# cell re-runnable: a bare `del` raises NameError on a second execution.
try:
    del voice_design_model
except NameError:
    pass
# Collect unreachable objects first so tensors held in reference cycles are
# released, then hand PyTorch's cached, now-unused CUDA blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

print("🎭 Loading CustomVoice 1.7B model...")
custom_voice_model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation=ATTN_IMPL,
)
print("✅ CustomVoice 1.7B model loaded!")

# Show available speakers
speakers = custom_voice_model.get_supported_speakers()
print(f"\n🎤 Available speakers: {', '.join(speakers)}")

🎭 Loading CustomVoice 1.7B model...


Fetching 4 files: 100%|██████████| 4/4 [00:40<00:00, 10.19s/it]


✅ CustomVoice 1.7B model loaded!

🎤 Available speakers: aiden, dylan, eric, ono_anna, ryan, serena, sohee, uncle_fu, vivian


In [7]:
# Same text, same speaker, different emotions
emotion_text = "I just found out the news about what happened yesterday."
speaker = "Ryan"

# (display label, instruction) pairs; an empty instruction requests no
# particular emotion.
emotions = [
    ("😊 Happy", "Very happy and excited tone"),
    ("😢 Sad", "Sad and melancholic, voice breaking slightly"),
    ("😠 Angry", "Angry and frustrated, speaking forcefully"),
    ("😨 Fearful", "Scared and anxious, voice trembling"),
    ("😐 Neutral", ""),  # No instruction = neutral
]

# Header: demo name, speaker, quoted text, blank line, divider
emotion_header = [
    "🎭 EMOTION CONTROL DEMO",
    f"Speaker: {speaker}",
    f'Text: "{emotion_text}"\n',
    "=" * 60,
]
print("\n".join(emotion_header))

for emotion_name, instruct in emotions:
    # Label first; the instruction line is shown only when one is given
    emotion_banner = f"\n{emotion_name}"
    if instruct:
        emotion_banner += f"\n   Instruction: {instruct}"
    print(emotion_banner)

    wavs, sr = custom_voice_model.generate_custom_voice(
        text=emotion_text,
        language="English",
        speaker=speaker,
        instruct=instruct,
    )
    # File tag is the second whitespace-separated word of the label, lowercased
    # (e.g. "😊 Happy" -> "happy").
    emotion_tag = emotion_name.split()[1].lower()
    play_audio(wavs[0], sr, f"cv_01_emotion_{emotion_tag}")

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


🎭 EMOTION CONTROL DEMO
Speaker: Ryan
Text: "I just found out the news about what happened yesterday."


😊 Happy
   Instruction: Very happy and excited tone


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



😢 Sad
   Instruction: Sad and melancholic, voice breaking slightly


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



😠 Angry
   Instruction: Angry and frustrated, speaking forcefully


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



😨 Fearful
   Instruction: Scared and anxious, voice trembling


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



😐 Neutral


In [8]:
# Speaking style variations
style_text = "Please be quiet, the baby is sleeping."
speaker = "Serena"

# (display label, instruction) pairs, one per delivery style
styles = [
    ("🤫 Whispering", "Whispering very softly and quietly"),
    ("📢 Loud", "Speaking loudly and clearly, projecting voice"),
    ("🏃 Fast", "Speaking very quickly, rushed pace"),
    ("🐢 Slow", "Speaking very slowly and deliberately, each word careful"),
    ("🎭 Dramatic", "Very dramatic and theatrical, like in a play"),
]

# Header: demo name, speaker, quoted text, blank line, divider
print(
    "🎨 SPEAKING STYLE CONTROL",
    f"Speaker: {speaker}",
    f'Text: "{style_text}"\n',
    "=" * 60,
    sep="\n",
)

for style_name, instruct in styles:
    print(f"\n{style_name}\n   Instruction: {instruct}")

    style_request = dict(
        text=style_text,
        language="English",
        speaker=speaker,
        instruct=instruct,
    )
    wavs, sr = custom_voice_model.generate_custom_voice(**style_request)

    # File tag is the label's second word, lowercased (e.g. "📢 Loud" -> "loud")
    style_word = style_name.split()[1]
    play_audio(wavs[0], sr, f"cv_02_style_{style_word.lower()}")

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


🎨 SPEAKING STYLE CONTROL
Speaker: Serena
Text: "Please be quiet, the baby is sleeping."


🤫 Whispering
   Instruction: Whispering very softly and quietly


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



📢 Loud
   Instruction: Speaking loudly and clearly, projecting voice


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



🏃 Fast
   Instruction: Speaking very quickly, rushed pace


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



🐢 Slow
   Instruction: Speaking very slowly and deliberately, each word careful


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



🎭 Dramatic
   Instruction: Very dramatic and theatrical, like in a play


In [9]:
# Role-play scenarios with different speakers and instructions
# Each entry is (speaker, text, instruction, display title).
roleplay_scenarios = [
    (
        "Ryan",
        "Welcome to tonight's championship game! The tension here is absolutely electric!",
        "Sports commentator, very energetic and excited, building hype",
        "🏆 Sports Commentator"
    ),
    (
        "Serena",
        "If you look at slide three, you'll see our quarterly revenue has increased by fifteen percent.",
        "Professional business presentation, confident and clear, corporate tone",
        "💼 Business Presenter"
    ),
    (
        "Aiden",
        "And then the dragon turned to face our party... roll for initiative!",
        "Dungeon master narrating an RPG game, mysterious and dramatic, building suspense",
        "🐉 Dungeon Master"
    ),
    (
        "Ono_Anna",
        "今日のレシピは、簡単で美味しいオムライスです！",
        "Cheerful cooking show host, warm and inviting, enthusiastic about food",
        "👩‍🍳 Cooking Show Host"
    ),
]

# Speakers whose scenario text is not English; any speaker not listed here is
# generated with language="English".
ROLEPLAY_SPEAKER_LANGUAGE = {"Ono_Anna": "Japanese"}

# Maximum number of characters of the text echoed in the log preview
ROLEPLAY_PREVIEW_CHARS = 50

print("🎬 ROLE-PLAY SCENARIOS\n")
print("="*60)

for speaker, text, instruct, title in roleplay_scenarios:
    # Add the ellipsis only when the text was actually cut, so a short line is
    # not displayed as if part of it were missing.
    if len(text) > ROLEPLAY_PREVIEW_CHARS:
        preview = f"{text[:ROLEPLAY_PREVIEW_CHARS]}..."
    else:
        preview = text

    print(f"\n{title}")
    print(f"   Speaker: {speaker}")
    print(f"   Text: {preview}")

    lang = ROLEPLAY_SPEAKER_LANGUAGE.get(speaker, "English")

    wavs, sr = custom_voice_model.generate_custom_voice(
        text=text,
        language=lang,
        speaker=speaker,
        instruct=instruct
    )
    # File tag is the first word after the emoji in the title, lowercased
    # (e.g. "🏆 Sports Commentator" -> "sports").
    play_audio(wavs[0], sr, f"cv_04_roleplay_{title.split()[1].lower()}")

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


🎬 ROLE-PLAY SCENARIOS


🏆 Sports Commentator
   Speaker: Ryan
   Text: Welcome to tonight's championship game! The tensio...


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



💼 Business Presenter
   Speaker: Serena
   Text: If you look at slide three, you'll see our quarter...


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



🐉 Dungeon Master
   Speaker: Aiden
   Text: And then the dragon turned to face our party... ro...


Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.



👩‍🍳 Cooking Show Host
   Speaker: Ono_Anna
   Text: 今日のレシピは、簡単で美味しいオムライスです！...
