In [6]:
from typing import Any
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    VitsModel,
    AutoTokenizer,
    pipeline,
)
from openai import OpenAI  # required by the rest of the file

# Device configuration
# Use the first CUDA device when one is available, otherwise the CPU.
_IS_CUDA = torch.cuda.is_available()
_DEVICE = torch.device("cuda:0" if _IS_CUDA else "cpu")
# Half precision on GPU, full precision on CPU.
# NOTE(review): _DTYPE is not referenced by any code visible in this cell —
# presumably used by ASR code elsewhere in the file; confirm before removing.
_DTYPE = torch.float16 if _IS_CUDA else torch.float32

# Model identifiers (Hugging Face Hub repository IDs)
_ASR_MODEL_ID = "openai/whisper-large-v3"
_TTS_MODEL_ID = "facebook/mms-tts-eng"

# Lazy-loaded globals: start as None and are populated on first use
# (_TTS_MODEL / _TTS_TOKENIZER are filled in by _ensure_tts()).
# Re-running this cell resets them to None, so the models are reloaded
# on the next call.
_ASR_PIPELINE = None
_TTS_MODEL = None
_TTS_TOKENIZER = None


def _ensure_tts():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    Both objects are loaded from ``_TTS_MODEL_ID``. The model is moved to
    ``_DEVICE`` and switched to eval mode before being stored in the
    module-level cache. If either cached object is missing, both are reloaded.
    """
    global _TTS_MODEL, _TTS_TOKENIZER

    cache_is_warm = _TTS_MODEL is not None and _TTS_TOKENIZER is not None
    if not cache_is_warm:
        # Tokenizer first, then the model — same order as a cold start.
        _TTS_TOKENIZER = AutoTokenizer.from_pretrained(_TTS_MODEL_ID)
        vits = VitsModel.from_pretrained(_TTS_MODEL_ID)
        vits = vits.to(_DEVICE)
        # Only publish the model once it is fully prepared.
        _TTS_MODEL = vits.eval()

    return _TTS_MODEL, _TTS_TOKENIZER


def text_to_speech(text: str, voice: str = ""):
    """Synthesize a speech waveform from text using MMS-TTS.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            yields an empty waveform without loading the model.
        voice: Accepted for interface compatibility but unused;
            facebook/mms-tts-eng is a single-speaker model.

    Returns:
        A float32 numpy array holding the model's ``waveform`` output,
        sampled at ``model.config.sampling_rate``. For empty input, a
        zero-length float32 array of shape ``(0,)``.
        NOTE(review): the waveform is returned exactly as the model produces
        it (no squeeze) — presumably shaped (batch, samples); confirm.
    """
    # Guard first: empty input must not trigger a (slow, possibly networked)
    # model download/load just to return nothing.
    if not text or not text.strip():
        return torch.zeros(0, dtype=torch.float32).cpu().numpy()

    model, tokenizer = _ensure_tts()

    inputs = tokenizer(text, return_tensors="pt")
    # Move every tokenizer output onto the same device as the model.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Note: `voice` is unused for facebook/mms-tts-eng (single speaker)
    with torch.inference_mode():
        audio = model(**inputs).waveform

    audio = audio.detach().cpu().numpy().astype("float32")
    return audio

In [7]:
from IPython.display import Audio
meh = text_to_speech("Hello, world!")
# Play back at the model's own sampling rate rather than a hard-coded guess;
# a mismatched rate shifts the pitch and speed of the audio.
# _TTS_MODEL is populated by the text_to_speech() call above.
Audio(meh, rate=_TTS_MODEL.config.sampling_rate)

In [10]:
from typing import Any
import numpy as np
import torch
from transformers import pipeline
from openai import OpenAI  # required by the rest of the file

# Device configuration
# Use the first CUDA device when one is available, otherwise the CPU.
_IS_CUDA = torch.cuda.is_available()
_DEVICE = torch.device("cuda:0" if _IS_CUDA else "cpu")

# Model identifiers (Hugging Face Hub repository IDs)
_ASR_MODEL_ID = "openai/whisper-large-v3"
_TTS_MODEL_ID = "facebook/mms-tts-eng"

# Lazy-loaded globals: start as None and are populated on first use
# (_TTS_PIPELINE is filled in by _ensure_tts()).
# Re-running this cell resets them to None, so the pipelines are rebuilt
# on the next call.
_ASR_PIPELINE = None
_TTS_PIPELINE = None


def _ensure_tts():
    """Return the cached TTS pipeline, building it on the first call.

    The pipeline is always created with ``torch_dtype=torch.float32`` (the
    original author notes this avoids a dtype mismatch). The
    ``"text-to-speech"`` task is tried first; if that raises any exception,
    the ``"text-to-audio"`` task is tried instead (intended for older
    Transformers releases).
    """
    global _TTS_PIPELINE

    if _TTS_PIPELINE is not None:
        return _TTS_PIPELINE

    # Arguments common to both attempts.
    shared_kwargs = dict(
        model=_TTS_MODEL_ID,
        device=0 if _IS_CUDA else -1,
        torch_dtype=torch.float32,  # keep FP32
    )

    try:
        _TTS_PIPELINE = pipeline(task="text-to-speech", **shared_kwargs)
    except Exception:
        # Fallback task name; an error here propagates to the caller.
        _TTS_PIPELINE = pipeline(task="text-to-audio", **shared_kwargs)

    return _TTS_PIPELINE


def text_to_speech(text: str, voice: str = ""):
    """Synthesize speech using MMS-TTS via the Transformers pipeline.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            yields an empty waveform without building the pipeline.
        voice: Accepted for interface compatibility; not used by this
            implementation.

    Returns:
        The pipeline's ``"audio"`` output as a float32 numpy array, or a
        zero-length float32 array of shape ``(0,)`` for empty input.

    Note:
        Only the waveform is returned. The pipeline also reports a
        ``"sampling_rate"``, but it is not part of this function's return
        value, so callers must obtain the rate from the pipeline/model
        themselves.
    """
    if not text or not text.strip():
        return np.zeros(0, dtype=np.float32)

    tts = _ensure_tts()
    out = tts(text)  # dict: {'audio': np.ndarray, 'sampling_rate': int}

    # asarray is a no-op for an ndarray that is already float32 and converts
    # anything else (other dtypes, array-likes) to float32.
    return np.asarray(out["audio"], dtype=np.float32)


In [13]:
from IPython.display import Audio
meh = text_to_speech("Hello, world!")
# Play back at the sampling rate configured on the pipeline's model rather
# than a hard-coded guess; a mismatched rate shifts the pitch and speed.
# _ensure_tts() returns the pipeline already cached by the call above.
Audio(meh, rate=_ensure_tts().model.config.sampling_rate)