# Catalyst AI Voice Studio Demo

This notebook demonstrates the basic workflows of the Catalyst AI Voice Studio.

## Setup and Imports

In [None]:
import os
import sys
import time
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path.cwd().parent))

# Import voice studio modules
from catalyst_ai_voice_studio.tts_service import XTTSLoader, OpenVoiceLoader
from catalyst_ai_voice_studio.text_normalizer import TextNormalizer
from catalyst_ai_voice_studio.prosody_planner import ProsodyPlanner
from catalyst_ai_voice_studio.utils.audio_tools import normalize_audio

# For audio playback and visualization
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from IPython.display import Audio, display

print("Imports successful!")

## 1. Basic Text-to-Speech Synthesis

In [None]:
# Create the XTTS loader used by the following cells and load its model
tts = XTTSLoader()
tts.load_model()

# Report the loader state so the reader can confirm setup succeeded
is_loaded = tts.is_model_loaded()
print(f"Model loaded: {is_loaded}")
rate_hz = tts.sample_rate
print(f"Sample rate: {rate_hz} Hz")

In [None]:
# Synthesize one sentence with the default voice
text = "Hello, world! This is a demonstration of the Catalyst AI Voice Studio."

audio = tts.synthesize(text, voice_id="default")

# Sample count first, then duration derived from the loader's sample rate
num_samples = len(audio)
print(f"Generated audio: {num_samples} samples")
duration_s = num_samples / tts.sample_rate
print(f"Duration: {duration_s:.2f} seconds")

# Inline audio player
display(Audio(audio, rate=tts.sample_rate))

## 2. Text Normalization

In [None]:
# Build the normalizer once for this section
normalizer = TextNormalizer()

# Sample input mixing titles, times, dates, currency, symbols, an e-mail
# address and a phone number
raw_text = """
Dr. Smith said: "The meeting is at 3:30 PM on Dec. 15th, 2023.
We'll discuss the $1,000,000 budget & Q4 results."
Contact him @ john.smith@company.com or call (555) 123-4567.
"""

normalized_text = normalizer.normalize(raw_text)

# Show input and output one after the other for comparison
print("Original text:", raw_text, sep="\n")
print("\nNormalized text:", normalized_text, sep="\n")

## 3. Prosody Planning

In [None]:
# Build the planner once for this section
prosody_planner = ProsodyPlanner()

# Sample input with punctuation, *asterisk* emphasis and an ALL-CAPS word
test_text = "Hello, my name is John. I'm *very* excited to meet you! This is IMPORTANT information."

# Compute the markers for the sample text
markers = prosody_planner.plan_prosody(test_text)

marker_count = len(markers)
print(f"Found {marker_count} prosody markers:")
for marker in markers:
    position, kind, strength = marker.position, marker.marker_type, marker.strength
    print(f"  Position {position}: {kind} (strength: {strength:.2f})")

# Apply the markers to the same text and show the result
prosody_text = prosody_planner.apply_prosody(test_text, markers)
print("\nText with prosody markup:", prosody_text, sep="\n")

## 4. Complete Pipeline

In [None]:
def synthesize_with_pipeline(text, voice_id="default", model="xtts"):
    """Run text through normalization, prosody markup, synthesis and post-processing.

    Args:
        text: Raw input text.
        voice_id: Voice identifier forwarded to the loader's ``synthesize``.
        model: Backend selector, either ``"xtts"`` or ``"openvoice"``.

    Returns:
        A ``(audio, sample_rate, info)`` tuple. ``info`` is a dict with the
        original, normalized and prosody-marked text, the number of prosody
        markers, and the duration computed as ``len(audio) / sample_rate``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Reject unsupported backends before constructing anything
    if model not in ("xtts", "openvoice"):
        raise ValueError(f"Unknown model: {model}")

    # Step 1: build and load the selected backend
    loader_cls = XTTSLoader if model == "xtts" else OpenVoiceLoader
    backend = loader_cls()
    backend.load_model()

    # Step 2: normalize the raw text
    cleaned = TextNormalizer().normalize(text)

    # Step 3: plan prosody on the normalized text, then apply it
    planner = ProsodyPlanner()
    plan = planner.plan_prosody(cleaned)
    marked_up = planner.apply_prosody(cleaned, plan)

    # Step 4: synthesize, then Step 5: post-process the audio
    waveform = normalize_audio(backend.synthesize(marked_up, voice_id=voice_id))

    rate = backend.sample_rate
    summary = {
        "original_text": text,
        "normalized_text": cleaned,
        "prosody_text": marked_up,
        "prosody_markers": len(plan),
        "duration": len(waveform) / rate,
    }
    return waveform, rate, summary

# Run the full pipeline on a paragraph that mixes emphasis, symbols,
# numbers, dates and abbreviations
test_text = """
Welcome to Catalyst AI Voice Studio! This is a *professional-grade* 
voice synthesis platform. It supports multiple TTS backends, 
including XTTS-v2 & OpenVoice. The system can handle complex text 
normalization: numbers like 123, dates like Dec. 15th, 2023, 
and abbreviations like Dr. Smith. Isn't that amazing?
"""

audio, sample_rate, info = synthesize_with_pipeline(test_text)

print("Pipeline Results:")
for field, field_value in info.items():
    # Long strings are cut to their first 100 characters to keep output short
    is_long_text = isinstance(field_value, str) and len(field_value) > 100
    shown = f"{field_value[:100]}..." if is_long_text else field_value
    print(f"{field}: {shown}")

# Inline audio player for the pipeline output
display(Audio(audio, rate=sample_rate))

## 5. Voice Comparison

In [None]:
# Compare different voices of the already-loaded `tts` model
comparison_text = "This is a voice comparison test. How do I sound?"

# Upper bound on how many voices are synthesized in this demo
MAX_VOICES = 3

# Get available voices
voices = tts.get_voices()
print("Available voices:")
# Cell-specific names (`voice_info`, `voice_audio`) are used so this cell
# does not rebind the notebook-level `info` / `audio` produced by the
# pipeline cell, which the save cell reads later.
for voice_id, voice_info in voices.items():
    print(f"  {voice_id}: {voice_info['name']}")

# Generate audio for each voice, keeping every sample for later inspection
voice_samples = {}
for voice_id in list(voices)[:MAX_VOICES]:
    print(f"\nGenerating sample for {voice_id}...")
    voice_audio = tts.synthesize(comparison_text, voice_id=voice_id)
    voice_samples[voice_id] = voice_audio

    print(f"Voice: {voices[voice_id]['name']}")
    display(Audio(voice_audio, rate=tts.sample_rate))

## 6. Audio Visualization

In [None]:
# Visualize generated audio as a waveform and a spectrogram
sample_audio = tts.synthesize("This is a sample for visualization.")

# Time axis: sample i occurs at i / sample_rate seconds. A linspace that
# includes the end point would stretch the spacing to dur / (N - 1).
time_axis = np.arange(len(sample_audio)) / tts.sample_rate

# Two stacked panels: waveform on top, spectrogram below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

# Waveform
ax1.plot(time_axis, sample_audio)
ax1.set_title('Waveform')
ax1.set_xlabel('Time (seconds)')
ax1.set_ylabel('Amplitude')
ax1.grid(True)

# Spectrogram: STFT magnitude converted to dB relative to its peak.
# `librosa` and `librosa.display` are imported in the notebook's import cell;
# the display submodule is imported explicitly there because `import librosa`
# alone does not expose it on every librosa release.
D = librosa.amplitude_to_db(np.abs(librosa.stft(sample_audio)), ref=np.max)
img = librosa.display.specshow(D, y_axis='hz', x_axis='time', sr=tts.sample_rate, ax=ax2)
ax2.set_title('Spectrogram')
fig.colorbar(img, ax=ax2, format='%+2.0f dB')

plt.tight_layout()
plt.show()

# Play the audio
display(Audio(sample_audio, rate=tts.sample_rate))

## 7. Streaming Synthesis

In [None]:
# Demonstrate streaming synthesis: audio arrives chunk by chunk
streaming_text = """
This is a demonstration of streaming synthesis. 
The audio is generated in chunks, which allows for 
real-time playback and lower latency applications. 
This is particularly useful for interactive applications 
and live voice assistants.
"""

print("Streaming synthesis...")
chunks = []

# enumerate() supplies the running chunk number, so no manual counter is needed
for chunk_index, chunk in enumerate(tts.stream(streaming_text, chunk_size=1024), start=1):
    chunks.append(chunk)
    print(f"Received chunk {chunk_index}: {len(chunk)} samples")

chunk_count = len(chunks)

# np.concatenate rejects an empty list with a generic message; fail with an
# explicit one so an empty stream is easy to diagnose.
if chunk_count == 0:
    raise ValueError("tts.stream() produced no audio chunks; nothing to concatenate or play")

# Concatenate chunks into one continuous signal
full_audio = np.concatenate(chunks)

print(f"\nStreaming complete: {chunk_count} chunks, {len(full_audio)} total samples")
print(f"Duration: {len(full_audio) / tts.sample_rate:.2f} seconds")

# Play the result
display(Audio(full_audio, rate=tts.sample_rate))

## 8. Model Comparison

In [None]:
# Compare XTTS and OpenVoice models on the same sentence
comparison_text = "This is a model comparison between XTTS and OpenVoice."

models = {
    "XTTS": XTTSLoader(),
    "OpenVoice": OpenVoiceLoader()
}

# Per-model audio and timing figures, keyed by display name
model_results = {}

for model_name, model in models.items():
    print(f"\nTesting {model_name}...")
    model.load_model()

    # Only the synthesize() call is timed; model loading is excluded.
    # perf_counter() is a monotonic clock meant for measuring intervals
    # (`time` is imported in the notebook's import cell).
    start_time = time.perf_counter()
    # `model_audio` rather than `audio`, so the pipeline output held in the
    # notebook-level `audio` is not rebound by this cell.
    model_audio = model.synthesize(comparison_text)
    synthesis_time = time.perf_counter() - start_time

    duration = len(model_audio) / model.sample_rate
    # Real-time factor = synthesis time / audio duration; guard against
    # empty audio, which would otherwise divide by zero.
    rtf = synthesis_time / duration if duration > 0 else float("inf")

    model_results[model_name] = {
        "audio": model_audio,
        "sample_rate": model.sample_rate,
        "synthesis_time": synthesis_time,
        "duration": duration,
        "rtf": rtf
    }

    print(f"Sample rate: {model.sample_rate} Hz")
    print(f"Synthesis time: {synthesis_time:.2f}s")
    print(f"Audio duration: {duration:.2f}s")
    print(f"Real-time factor: {rtf:.2f}x")

    display(Audio(model_audio, rate=model.sample_rate))

## 9. Save Audio Files

In [None]:
# Write the generated audio to WAV files under ./demo_outputs
output_dir = Path("demo_outputs")
output_dir.mkdir(exist_ok=True)

# Pipeline result.
# NOTE(review): `audio` and `sample_rate` are shared notebook-level names;
# confirm they still hold the pipeline output when this cell runs.
output_file = output_dir / "pipeline_demo.wav"
sf.write(output_file, audio, sample_rate)
print(f"Saved pipeline demo to: {output_file}")

# One file per compared model, named after the lower-cased model name
for model_name in model_results:
    result = model_results[model_name]
    output_file = output_dir / f"{model_name.lower()}_comparison.wav"
    sf.write(output_file, result["audio"], result["sample_rate"])
    print(f"Saved {model_name} comparison to: {output_file}")

resolved_dir = output_dir.absolute()
print(f"\nAll demo outputs saved to: {resolved_dir}")

## Summary

This notebook demonstrated the key features of Catalyst AI Voice Studio:

1. **Basic TTS Synthesis** - Simple text-to-speech conversion
2. **Text Normalization** - Converting numbers, abbreviations, and symbols
3. **Prosody Planning** - Adding pauses and emphasis markers
4. **Complete Pipeline** - End-to-end processing workflow
5. **Voice Comparison** - Comparing the voices available from a single loaded model
6. **Audio Visualization** - Waveform and spectrogram analysis
7. **Streaming Synthesis** - Real-time audio generation
8. **Model Comparison** - Performance comparison between models
9. **File Output** - Saving generated audio

The platform provides a flexible and extensible architecture for professional voice synthesis applications.