In [1]:
!pip install -U qwen-tts soundfile -q

# Install flash-attention for faster inference (optional but recommended)
# This compilation can take 5-10 minutes
!pip install flash-attn --no-build-isolation -q

In [2]:
# Verify GPU and imports
import torch
import soundfile as sf
import os
from IPython.display import Audio, display, Markdown, HTML

# Report the PyTorch build and, if a CUDA device is visible, its name and memory.
print(f"PyTorch: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

# Directory that receives generated audio; exist_ok makes this safe to re-run.
os.makedirs("audio_outputs", exist_ok=True)

# Select the attention backend: Flash Attention 2 only when the package imports,
# otherwise fall back to the standard ("eager") implementation.
try:
    import flash_attn
except ImportError:
    ATTN_IMPL = "eager"
    print("\n‚ö†Ô∏è Flash Attention not available, using standard attention")
else:
    ATTN_IMPL = "flash_attention_2"
    print("\n‚úÖ Flash Attention 2 available")

PyTorch: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Ti
VRAM: 12.5 GB

‚úÖ Flash Attention 2 available


In [6]:
# Helper function for generating and playing audio
def play_audio(wav, sr, filename=None, title=None, output_dir="audio_outputs"):
    """Optionally save a waveform to disk, then render an inline audio player.

    Args:
        wav: Audio samples; passed unchanged to ``sf.write`` and ``Audio``.
        sr: Sample rate; passed to ``sf.write`` and as ``rate`` to ``Audio``.
        filename: Base name without extension. When truthy, the audio is
            written to ``<output_dir>/<filename>.wav``; otherwise nothing
            is saved.
        title: When truthy, rendered as bold Markdown above the player.
        output_dir: Directory that receives saved files. It is created on
            demand so saving does not depend on another cell having made it.

    Returns:
        None. The player (and optional title) are shown via ``display``.
    """
    if filename:
        # Create the target directory here rather than relying on notebook state.
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, f"{filename}.wav")
        sf.write(filepath, wav, sr)
    if title:
        display(Markdown(f"**{title}**"))
    display(Audio(wav, rate=sr))

from qwen_tts import Qwen3TTSModel

# Free up cached GPU memory before the Base checkpoint is loaded.
torch.cuda.empty_cache()

print("üîä Loading Base (Voice Clone) model...")
# Loading options gathered in one place: target device, weight dtype, and the
# attention backend chosen earlier in the notebook (ATTN_IMPL).
base_model_kwargs = dict(
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation=ATTN_IMPL,
)
clone_model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base", **base_model_kwargs
)
print("‚úÖ Base model loaded!")

  from .autonotebook import tqdm as notebook_tqdm
/bin/sh: 1: sox: not found
SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


üîä Loading Base (Voice Clone) model...


You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Fetching 4 files: 100%|██████████| 4/4 [00:51<00:00, 12.96s/it]


‚úÖ Base model loaded!


In [8]:
# Transcript of the reference recording, passed alongside the audio when the
# voice-clone prompt is built. The pieces below concatenate to one string.
ref_text = (
    "Hey everybody this is Dave Erickson and I wanted to record a sample of my voice. "
    "This is so that I can clone my voice. "
    "It's important that i have the ability to create a digital twin of myself. "
    "I am expressive and I speak in an uh eloquent and uh maybe a little bit  informal tone. "
    "Thanks"
)
# Despite the variable name, this is a path relative to the notebook's working directory.
ref_audio_url = "./DaveSample.m4a"

# Build the prompt once; it is reused for later generations.
voice_prompt = clone_model.create_voice_clone_prompt(
    ref_text=ref_text,
    ref_audio=ref_audio_url,
)
print("‚úÖ Voice prompt created!\n")


# Sentences synthesized together in one batched call using the prebuilt prompt
batch_texts = [
    "First, let me tell you about the basics.",
    "Second, we'll dive into the details.",
    "Third, we'll look at some examples.",
    "Finally, we'll wrap everything up.",
]

print("üì¶ Batch generating with reusable prompt...\n")

# One language label per sentence, matching the length of the text list.
batch_languages = ["English" for _ in batch_texts]
wavs, sr = clone_model.generate_voice_clone(
    text=batch_texts,
    language=batch_languages,
    voice_clone_prompt=voice_prompt,
)

# Echo each sentence, then save and play its clip; file names are numbered from 1.
for clip_no, (sentence, clip) in enumerate(zip(batch_texts, wavs), start=1):
    print(f"üìù {sentence}")
    play_audio(clip, sr, f"clone_02_batch_{clip_no}")

  audio, sr = librosa.load(x, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.


‚úÖ Voice prompt created!

üì¶ Batch generating with reusable prompt...

üìù First, let me tell you about the basics.


üìù Second, we'll dive into the details.


üìù Third, we'll look at some examples.


üìù Finally, we'll wrap everything up.
