# In [None]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from utils.data_loader import LibriSpeechDataLoader
import config

# Pin the NumPy and TensorFlow seeds to one shared value for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Build the loader from the project's dataset configuration
print("Initializing LibriSpeech Data Loader...")
data_loader = LibriSpeechDataLoader(config.DATASET_CONFIG)

# Request the three splits; the result is kept in `datasets` for later steps
print("\nLoading LibriSpeech dataset...")
splits_to_load = ['train', 'validation', 'test']
datasets = data_loader.load_dataset(splits=splits_to_load)

# Hand the loaded splits to the loader's info routine
# (presumably prints a summary -- defined in utils.data_loader, not shown here)
data_loader.get_dataset_info(datasets)

# Save character mappings
# The vocabulary is persisted twice: as NumPy files for reloading from code
# and as a plain-text listing for manual inspection.
print("\nSaving character mappings...")
# NOTE(review): char_to_num is used as a dict below (.items(), len()).
# np.save stores non-array objects by pickling them inside an object array,
# so reading these files back presumably needs
# np.load(path, allow_pickle=True).item() -- confirm in the consuming code.
np.save('char_to_num.npy', data_loader.char_to_num)
np.save('num_to_char.npy', data_loader.num_to_char)

# Save character mapping as text file
# The encoding is given explicitly so the output does not depend on the
# platform's default locale encoding, which could fail on a non-ASCII
# vocabulary character.
with open('character_mapping.txt', 'w', encoding='utf-8') as f:
    f.write("Character to Number Mapping:\n")
    f.write("=" * 30 + "\n")
    # Entries are written in sorted-by-character order, one per line.
    for char, num in sorted(data_loader.char_to_num.items()):
        f.write(f"'{char}': {num}\n")

print("Character mappings saved!")
print(f"Vocabulary size: {len(data_loader.char_to_num)}")

# Visualize some samples
# Plots up to `samples_per_split` raw waveforms from each available split on
# a fixed subplot grid, saves the figure to 'data_samples.png', then shows it.
print("\nVisualizing sample data...")
plt.figure(figsize=(15, 10))

grid_rows, grid_cols = 3, 2
max_samples = grid_rows * grid_cols  # one subplot per plotted sample
samples_per_split = 2
title_chars = 50  # transcript characters shown before truncating with "..."

sample_count = 0
for split_name, dataset in datasets.items():
    # Check the cap here as well: the inner `break` alone would leave this
    # loop running, and each remaining split would still be iterated far
    # enough to fetch one example before being abandoned.
    if sample_count >= max_samples:
        break
    if dataset is None:
        # Splits that were not loaded are skipped.
        continue

    for example in dataset.take(samples_per_split):
        if sample_count >= max_samples:
            break

        audio = example['audio'].numpy()
        text = example['text'].numpy().decode('utf-8')
        suffix = "..." if len(text) > title_chars else ""

        # Subplot indices are 1-based.
        plt.subplot(grid_rows, grid_cols, sample_count + 1)
        plt.plot(audio)
        plt.title(f'{split_name.capitalize()}\n"{text[:title_chars]}{suffix}"')
        plt.xlabel('Samples')
        plt.ylabel('Amplitude')

        sample_count += 1

plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('data_samples.png', dpi=300, bbox_inches='tight')
plt.show()

# Closing status lines: report completion and point to the next step
for status_line in (
    "Data preprocessing completed!",
    "Next: Run feature extraction notebook",
):
    print(status_line)