# Muyan-TTS Text-to-Speech Test

This notebook demonstrates how to use Muyan-TTS to convert text to speech.

In [None]:
import torch
import soundfile as sf
from transformers import AutoTokenizer, AutoModel
import numpy as np

In [None]:
# Identifier of the Muyan-TTS checkpoint to load; edit this value if the
# published model name is different.
model_name = "Muyan/MuyanTTS"

# Load the tokenizer first, then the model. Any exception raised while
# loading is reported on stdout instead of stopping the notebook.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please verify the correct model name for Muyan-TTS")
else:
    print("Model loaded successfully!")

In [None]:
# Sample sentence that the later cells pass to the synthesizer
text = (
    "Hello, this is a test of the Muyan text-to-speech system. How does it sound?"
)

# Echo the input so the run output records what was synthesized
print(f"Text to synthesize: {text}")

In [None]:
# Generate speech from text
def text_to_speech(text, model, tokenizer):
    """
    Convert text to speech using Muyan-TTS model

    The text is tokenized into PyTorch tensors, passed through the model with
    gradient tracking disabled, and the first usable candidate is returned
    from the model output.

    Args:
        text: The string to synthesize.
        model: Callable invoked as ``model(**inputs)`` with the tokenizer
            output.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``;
            its result must be a mapping usable with ``**``.

    Returns:
        The output's ``audio`` attribute if it is set, otherwise its
        ``last_hidden_state`` attribute if it is set, otherwise ``outputs[0]``.
        NOTE(review): whether ``last_hidden_state`` or ``outputs[0]`` is a
        real waveform depends on the model architecture -- confirm against
        the Muyan-TTS documentation.

    Raises:
        Whatever the tokenizer or model raise; also whatever ``outputs[0]``
        raises when the fallback is reached and the output is not indexable.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        # Note: This is a generic approach - actual implementation may vary
        # depending on the specific Muyan-TTS model architecture
        outputs = model(**inputs)

        # An output container can expose a field that exists but is None;
        # hasattr() alone would accept it and hand None back to the caller.
        # Skip None-valued fields so the next candidate is tried instead.
        for field_name in ("audio", "last_hidden_state"):
            candidate = getattr(outputs, field_name, None)
            if candidate is not None:
                return candidate

        # Neither named field is usable: fall back to the first output
        return outputs[0]

# Synthesize the sample sentence. Failures are printed rather than raised so
# the remaining cells can still be executed.
try:
    audio_output = text_to_speech(text=text, model=model, tokenizer=tokenizer)
    # Print the output's shape as a quick sanity check
    print(f"Audio generated! Shape: {audio_output.shape}")
except Exception as e:
    print(f"Error generating speech: {e}")

In [None]:
# Destination WAV path and the sample rate it is written with
output_file = "muyan_tts_output.wav"
sample_rate = 22050  # NOTE(review): assumed rate -- confirm against the model

try:
    # Drop singleton dimensions, move to CPU and convert to a NumPy array
    audio_np = audio_output.squeeze().cpu().numpy()

    # Rescale by the absolute peak only when a sample falls outside [-1.0, 1.0]
    out_of_range = audio_np.max() > 1.0 or audio_np.min() < -1.0
    if out_of_range:
        peak = np.abs(audio_np).max()
        audio_np = audio_np / peak

    # Write the samples to disk and report where they went
    sf.write(output_file, audio_np, sample_rate)
    print(f"Audio saved to: {output_file}")

except Exception as e:
    print(f"Error saving audio: {e}")
    print("Audio output format may need adjustment for this specific model")

In [None]:
# Optional inline playback of the saved WAV file (needs IPython.display)
try:
    from IPython.display import Audio, display

    # Build the player from the file on disk and render it in one step
    display(Audio(filename=output_file))

except ImportError:
    print("IPython not available for audio playback")
except Exception as e:
    print(f"Could not play audio: {e}")

## Notes

- This notebook provides a basic framework for using Muyan-TTS
- The actual model loading and inference code may need adjustment based on the specific Muyan-TTS implementation
- Check the model documentation for exact parameter names and output formats
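- Adjust the sample rate (`sample_rate`, 22050 Hz in this notebook) to match the model's actual output rate, since a mismatch makes the saved audio play at the wrong speed and pitch, and adapt the audio processing as needed for your specific use case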