# Text-to-Audio Project Demo

This notebook provides a comprehensive demonstration of the text-to-audio conversion system using lightweight Hugging Face models.

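## Features Covered:
- Text preprocessing and tokenization
- TTS model integration
- Audio processing, visualization, and output
- Complete text-to-audio pipeline
- Dataset management
- Audio processing comparison across configurations
- Performance analysis
- Interactive examples with visualizations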

## Setup and Imports

In [None]:
import sys
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display, HTML
import warnings
warnings.filterwarnings('ignore')

# Locate the project root: when the notebook is launched from the `examples`
# directory the root is its parent; otherwise it is the working directory itself.
project_root = Path.cwd().parent if Path.cwd().name == 'examples' else Path.cwd()

# Make the project's `src` directory importable. The membership check keeps this
# cell idempotent: re-running it does not stack duplicate entries on sys.path.
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import our modules (resolved through the `src` entry added above)
from text_processor import TextProcessor
from tts_model import TTSModelManager
from audio_processor import AudioProcessor
from main import TextToAudioConverter
from dataset_manager import DatasetManager

print("✅ All imports successful!")
print(f"Project root: {project_root}")

## 1. Text Processing Demo

In [None]:
# Text-processing walkthrough: clean a tricky sentence, report its statistics,
# and run the TTS preprocessing step on it.
text_processor = TextProcessor()

# A sentence mixing abbreviations, a temperature, a URL, a time, a date and symbols
sample_text = "Hello Dr. Smith! Today is 25°C. Please visit www.example.com at 3:30 PM on Jan. 1st, 2024. We have 5 cats & 3 dogs."

print("📝 Text Processing Demo")
print("=" * 30)
print(f"Original text: {sample_text}\n")

# Cleaning pass
cleaned_text = text_processor.clean_text(sample_text)
print(f"Cleaned text: {cleaned_text}\n")

# Statistics are computed on the original (uncleaned) text
stats = text_processor.get_text_stats(sample_text)
print("Text Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")
print()

# TTS preprocessing, listed with 1-based numbering
tts_chunks = text_processor.preprocess_for_tts(sample_text)
print(f"TTS Chunks ({len(tts_chunks)}):")
for position, chunk_text in enumerate(tts_chunks, start=1):
    print(f"  {position}. {chunk_text}")

## 2. TTS Model Integration Demo

In [None]:
# Create the TTS model manager, then report its metadata and the models it lists.
print("🤖 Initializing TTS Model...")
tts_model = TTSModelManager()

# Model metadata, one "key: value" line per entry
model_info = tts_model.get_model_info()
print("\n📊 Model Information:")
for field_name, field_value in model_info.items():
    print(f"  {field_name}: {field_value}")

# Models the manager reports as available
available_models = tts_model.list_available_models()
print(f"\n🔍 Available Models: {available_models}")

In [None]:
# Synthesize one sentence and, on success, report its shape/duration and play it.
test_text = "This is a demonstration of text to speech synthesis using Hugging Face models."
print(f"🎵 Generating audio for: '{test_text}'")

# `audio_data` is reused by the processing and visualization cells that follow
audio_data = tts_model.synthesize_speech(test_text)

if audio_data is None:
    print("❌ Audio generation failed")
else:
    print("✅ Audio generated successfully!")
    print(f"   Shape: {audio_data.shape}")
    duration_seconds = len(audio_data) / tts_model.get_sample_rate()
    print(f"   Duration: {duration_seconds:.2f} seconds")

    # Inline audio player
    display(Audio(audio_data, rate=tts_model.get_sample_rate()))

## 3. Audio Processing and Visualization

In [None]:
# Create the audio processor; it is reused by later cells to load and save audio.
audio_processor = AudioProcessor()

if audio_data is not None:
    # Describe the synthesized clip; floats are shown with four decimals
    audio_info = audio_processor.get_audio_info(audio_data, tts_model.get_sample_rate())
    print("🔊 Audio Information:")
    for field_name, field_value in audio_info.items():
        rendered = f"{field_value:.4f}" if isinstance(field_value, float) else f"{field_value}"
        print(f"  {field_name}: {rendered}")

    # Fade first, then normalize the faded signal
    print("\n🎛️ Applying audio processing...")
    faded_audio = audio_processor.apply_fade(audio_data, sample_rate=tts_model.get_sample_rate())
    processed_audio = audio_processor.normalize_audio(faded_audio)

    print("✅ Audio processing completed")

In [None]:
# Visualize the synthesized audio: waveforms on the top row, spectrograms below,
# then save the processed clip. Relies on `audio_data` from the synthesis cell and
# `processed_audio` from the audio-processing cell.
if audio_data is not None:
    # The time/frequency axes must be drawn with the model's actual sample rate;
    # librosa.display.specshow otherwise assumes its 22050 Hz default.
    viz_sample_rate = tts_model.get_sample_rate()

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # Original waveform
    axes[0, 0].plot(audio_data)
    axes[0, 0].set_title('Original Audio Waveform')
    axes[0, 0].set_xlabel('Samples')
    axes[0, 0].set_ylabel('Amplitude')
    
    # Processed waveform
    axes[0, 1].plot(processed_audio)
    axes[0, 1].set_title('Processed Audio Waveform')
    axes[0, 1].set_xlabel('Samples')
    axes[0, 1].set_ylabel('Amplitude')
    
    # Spectrogram: |STFT| is an amplitude, so amplitude_to_db is the right scale
    D = librosa.stft(audio_data)
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    librosa.display.specshow(S_db, sr=viz_sample_rate, x_axis='time', y_axis='hz', ax=axes[1, 0])
    axes[1, 0].set_title('Spectrogram')
    
    # Mel spectrogram: melspectrogram returns a *power* spectrogram by default,
    # so it must be converted with power_to_db (amplitude_to_db would double the dB range)
    S = librosa.feature.melspectrogram(y=audio_data, sr=viz_sample_rate)
    S_db_mel = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_db_mel, sr=viz_sample_rate, x_axis='time', y_axis='mel', ax=axes[1, 1])
    axes[1, 1].set_title('Mel Spectrogram')
    
    plt.tight_layout()
    plt.show()

    # Save the processed audio
    output_path = audio_processor.save_audio(
        processed_audio, 
        "notebook_demo", 
        viz_sample_rate
    )
    print(f"\n💾 Audio saved to: {output_path}")

## 4. Complete Text-to-Audio Pipeline

In [None]:
# Build the end-to-end converter and report how it is configured.
print("🔧 Initializing Text-to-Audio Converter...")
converter = TextToAudioConverter()

# Configuration summary as reported by the converter
system_info = converter.get_system_info()
print("\n📋 System Configuration:")
print(f"  Model: {system_info['model_name']}")
print(f"  Device: {system_info['device']}")
print(f"  Output Directory: {system_info['output_dir']}")
print(f"  Audio Format: {system_info['config']['audio_format']}")

# One line per component: check mark when its status is truthy, cross otherwise
components = system_info['components_initialized']
print("\n🧩 Component Status:")
for component_name, is_ready in components.items():
    if is_ready:
        marker = "✅"
    else:
        marker = "❌"
    print(f"  {marker} {component_name}")

In [None]:
# Demo: convert several kinds of text through the full pipeline, play each
# result, and report a rough speaking rate.
demo_texts = [
    "Hello! This is a simple greeting.",
    "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
    "In 2024, AI technology advanced significantly. Dr. Johnson's research at MIT showed promising results.",
    "Welcome to our interactive demo! Today we'll explore text-to-speech synthesis using state-of-the-art models."
]

print("🎯 Converting Multiple Text Examples...")
print("=" * 40)

# One (audio, sample_rate, text) tuple per successful conversion
audio_outputs = []

for i, text in enumerate(demo_texts, 1):
    print(f"\n📝 Example {i}: {text[:50]}...")

    # Convert text to audio; a falsy path signals failure
    output_path = converter.convert_text(text, f"demo_example_{i}")
    if not output_path:
        print(f"❌ Failed to convert example {i}")
        continue

    print(f"✅ Success: {Path(output_path).name}")

    # Load into loop-specific names so the notebook-level `audio_data` produced by
    # the synthesis cell is not overwritten (earlier cells stay re-runnable).
    example_audio, example_rate = audio_processor.load_audio(output_path)
    audio_outputs.append((example_audio, example_rate, text))

    # Show audio player
    display(HTML(f"<b>🎵 Audio for Example {i}:</b>"))
    display(Audio(example_audio, rate=example_rate))

    # Quick statistics; guard the rate against an empty (zero-length) audio file
    duration = len(example_audio) / example_rate
    word_count = len(text.split())
    if duration > 0:
        print(f"   Duration: {duration:.2f}s, Words: {word_count}, Rate: {word_count/duration*60:.0f} words/min")
    else:
        print(f"   Duration: {duration:.2f}s, Words: {word_count}, Rate: n/a (empty audio)")

# Every entry in audio_outputs is a successful conversion
print(f"\n🎉 Completed {len(audio_outputs)} out of {len(demo_texts)} conversions!")

## 5. Dataset Management Demo

In [None]:
# Create the dataset manager and print the catalogue of datasets it reports.
print("📊 Dataset Management Demo")
print("=" * 30)

dataset_manager = DatasetManager()

# Each entry maps a dataset key to a dict of descriptive fields
available_datasets = dataset_manager.list_available_datasets()
print("Available Datasets:")
for dataset_key, details in available_datasets.items():
    print(f"  📁 {dataset_key}: {details['name']}")
    print(f"     Type: {details['type']}, Language: {details['language']}")
    # Trailing newline leaves a blank line between entries
    print(f"     Sample Rate: {details['sample_rate']} Hz, Size: {details['size']}\n")

In [None]:
# Load a handful of LJSpeech samples, preview up to three of them, and print
# whatever statistics the dataset manager reports.
print("📥 Loading LJSpeech dataset (limited sample)...")

try:
    # Only a few samples are requested to keep the demo light
    ljspeech_sample = dataset_manager.load_ljspeech(max_samples=5)

    if not ljspeech_sample:
        print("❌ Failed to load dataset")
    else:
        print(f"✅ Loaded {len(ljspeech_sample)} samples")

        # Preview at most the first three samples
        print("\n📋 Sample Data:")
        preview_count = min(3, len(ljspeech_sample))
        for idx in range(preview_count):
            sample = ljspeech_sample[idx]
            print(f"\nSample {idx + 1}:")
            print(f"  Text: {sample['text'][:100]}...")
            if 'audio' not in sample:
                continue

            audio_info = sample['audio']
            duration = len(audio_info['array']) / audio_info['sampling_rate']
            print(f"  Audio: {duration:.2f}s, {audio_info['sampling_rate']} Hz")

            # Only the first sample gets an inline player
            if idx == 0:
                print("  🎵 Playing first sample:")
                display(Audio(audio_info['array'], rate=audio_info['sampling_rate']))

        # Aggregate statistics; the text and audio sections are each optional
        stats = dataset_manager.get_dataset_stats('ljspeech')
        if stats:
            print("\n📊 Dataset Statistics:")
            print(f"  Total samples: {stats['total_samples']}")
            if 'text_stats' in stats:
                text_summary = stats['text_stats']
                print(f"  Avg words per text: {text_summary['avg_words']:.1f}")
                print(f"  Text length range: {text_summary['min_chars']}-{text_summary['max_chars']} chars")
            if 'audio_stats' in stats:
                audio_summary = stats['audio_stats']
                print(f"  Avg duration: {audio_summary['avg_duration']:.2f}s")
                print(f"  Sample rates: {audio_summary['sample_rates']}")

except Exception as e:
    # Deliberately best-effort: any failure is reported and the notebook continues
    print(f"⚠️ Dataset loading encountered an issue: {e}")
    print("This is normal for demo purposes - full datasets are large!")

## 6. Interactive Audio Comparison

In [None]:
# Compare different processing settings on the same sentence. Each variant is
# applied on top of the converter's baseline configuration so that options
# enabled for one variant do not leak into the next one.
print("🔄 Audio Processing Comparison")
print("=" * 35)

comparison_text = "This text will be processed with different audio settings to demonstrate the effects of various processing options."

# Different configurations to test
configs = [
    {"name": "Default", "settings": {}},
    {"name": "With Fade", "settings": {"apply_fade": True}},
    {"name": "Normalized", "settings": {"normalize_audio": True}},
    {"name": "Full Processing", "settings": {"apply_fade": True, "normalize_audio": True, "noise_reduction": True}}
]

# Snapshot the current value of every option that any variant toggles.
# NOTE(review): assumes get_system_info()['config'] is a dict keyed by the same
# option names that update_config accepts - confirm. Options missing from it are
# simply not restored, which degrades to the previous cumulative behavior.
toggled_options = sorted({option for config in configs for option in config["settings"]})
current_config = converter.get_system_info()["config"]
baseline_settings = {
    option: current_config[option]
    for option in toggled_options
    if option in current_config
}

comparison_results = []

try:
    for config in configs:
        print(f"\n🎛️ Testing: {config['name']}")

        # Reset to the baseline first, then layer this variant's overrides on top
        converter.update_config(**{**baseline_settings, **config['settings']})

        # Convert text
        output_path = converter.convert_text(comparison_text, f"comparison_{config['name'].lower().replace(' ', '_')}")

        if output_path:
            # Loop-specific names keep the notebook-level `audio_data` from the
            # synthesis cell intact.
            comparison_audio, comparison_rate = audio_processor.load_audio(output_path)
            comparison_results.append({
                "name": config['name'],
                "audio": comparison_audio,
                "sample_rate": comparison_rate,
                "path": output_path
            })

            print(f"✅ Generated: {Path(output_path).name}")

            # Display audio player
            display(HTML(f"<b>🎵 {config['name']} Processing:</b>"))
            display(Audio(comparison_audio, rate=comparison_rate))
        else:
            print(f"❌ Failed: {config['name']}")
finally:
    # Put the converter back to its baseline so later cells are not silently
    # run with the last variant's settings.
    converter.update_config(**baseline_settings)

print(f"\n🎉 Generated {len(comparison_results)} comparison samples!")

## 7. Performance Analysis

In [None]:
import time

# Performance testing: time conversions of increasing length and compare the
# processing time with the duration of the audio produced.
print("⚡ Performance Analysis")
print("=" * 25)

# Test different text lengths
test_texts = [
    "Short text.",
    "This is a medium length text that contains several words and should take a moderate amount of time to process.",
    "This is a much longer text sample that we will use to test the performance of our text-to-speech system with extended content. It contains multiple sentences and should provide a good benchmark for processing longer documents. The system should handle this efficiently while maintaining good audio quality throughout the entire conversion process."
]

# One metrics dict per successfully measured conversion
performance_results = []

for i, text in enumerate(test_texts, 1):
    word_count = len(text.split())
    char_count = len(text)

    print(f"\n📝 Test {i}: {word_count} words, {char_count} characters")

    # Time the conversion with a monotonic, high-resolution clock; time.time()
    # can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    output_path = converter.convert_text(text, f"perf_test_{i}")
    conversion_time = time.perf_counter() - start_time

    if not output_path:
        print(f"   ❌ Conversion failed")
        continue

    # Loop-specific names keep the notebook-level `audio_data` from the
    # synthesis cell intact.
    perf_audio, perf_sample_rate = audio_processor.load_audio(output_path)
    audio_duration = len(perf_audio) / perf_sample_rate

    # Both ratios below divide by a measured duration; skip degenerate runs
    # (e.g. an empty audio file) instead of raising ZeroDivisionError.
    if audio_duration <= 0 or conversion_time <= 0:
        print("   ❌ Conversion produced no measurable audio; skipping metrics")
        continue

    # Calculate metrics: RTF < 1 means synthesis is faster than playback
    real_time_factor = conversion_time / audio_duration
    words_per_second = word_count / conversion_time

    result = {
        "test_id": i,
        "word_count": word_count,
        "char_count": char_count,
        "conversion_time": conversion_time,
        "audio_duration": audio_duration,
        "real_time_factor": real_time_factor,
        "words_per_second": words_per_second
    }

    performance_results.append(result)

    print(f"   ⏱️ Conversion time: {conversion_time:.2f}s")
    print(f"   🎵 Audio duration: {audio_duration:.2f}s")
    print(f"   📊 Real-time factor: {real_time_factor:.2f}x")
    print(f"   🚀 Processing speed: {words_per_second:.1f} words/sec")

# Summary (only when at least one conversion was measured)
if performance_results:
    avg_rtf = np.mean([r['real_time_factor'] for r in performance_results])
    avg_wps = np.mean([r['words_per_second'] for r in performance_results])

    print(f"\n📊 Average Performance:")
    print(f"   Real-time factor: {avg_rtf:.2f}x")
    print(f"   Processing speed: {avg_wps:.1f} words/sec")

    if avg_rtf < 1.0:
        print("   ✅ System runs faster than real-time!")
    else:
        print("   ⏳ System runs slower than real-time")

## 8. Cleanup and Summary

In [None]:
# Cleanup resources. Each object is released only if its cell was actually run;
# at notebook top level locals() is the notebook namespace.
print("🧹 Cleaning up resources...")

if 'tts_model' in locals():
    tts_model.cleanup()

if 'converter' in locals():
    converter.cleanup()

if 'audio_processor' in locals():
    audio_processor.cleanup()

print("✅ Cleanup completed!")

# Summary
print("\n🎉 Notebook Demo Summary")
print("=" * 30)
print("Features demonstrated:")
print("  ✅ Text preprocessing and tokenization")
print("  ✅ TTS model integration")
print("  ✅ Audio processing and visualization")
print("  ✅ Complete text-to-audio pipeline")
print("  ✅ Dataset management")
print("  ✅ Interactive audio comparison")
print("  ✅ Performance analysis")
print("\n🚀 The text-to-audio system is ready for use!")

# List generated files. The directory is derived from `project_root` (set in the
# setup cell) rather than a hard-coded "../output", which only resolved correctly
# when the notebook was started from the `examples` directory.
# NOTE(review): assumes generated audio lands in <project_root>/output - confirm
# against the converter's reported output directory.
output_dir = project_root / "output"
if output_dir.exists():
    audio_files = list(output_dir.glob("*.wav"))
    if audio_files:
        print(f"\n📁 Generated {len(audio_files)} audio files:")
        for file_path in sorted(audio_files)[-5:]:  # Last 5 files in name order
            size_kb = file_path.stat().st_size / 1024
            print(f"   📄 {file_path.name} ({size_kb:.1f} KB)")
        if len(audio_files) > 5:
            print(f"   ... and {len(audio_files) - 5} more files")