# AudioToText Pipeline Demo

## This notebook demonstrates how to use the AudioToText pipeline to transcribe speech audio into text using a speech-to-text model.


In [2]:
import logging

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset

from huggingface_pipelines.dataset import DatasetConfig
from huggingface_pipelines.speech import AudioToTextHFPipeline, AudioPipelineConfig




# Emit INFO-level (and higher) log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO)



In [4]:
# Describe which dataset to load: the English ("en") configuration of
# Common Voice 11.0, "test" split.
# NOTE(review): trust_remote_code is presumably forwarded to the dataset
# loader so the dataset's builder script may run — confirm in DatasetConfig.
dataset_config = DatasetConfig(
    dataset_name="mozilla-foundation/common_voice_11_0",
    dataset_split="test",
    trust_remote_code=True,
    config="en",
)

# Materialise the dataset described by the configuration above.
dataset = dataset_config.load_dataset()

Downloading builder script: 100%|██████████| 8.13k/8.13k [00:00<00:00, 37.8MB/s]
Downloading readme: 100%|██████████| 14.4k/14.4k [00:00<00:00, 32.7MB/s]
Downloading extra modules: 100%|██████████| 3.44k/3.44k [00:00<00:00, 16.1MB/s]
Downloading extra modules: 100%|██████████| 60.9k/60.9k [00:00<00:00, 832kB/s]
Downloading data: 100%|██████████| 12.2k/12.2k [00:00<00:00, 26.4MB/s]
Downloading data:   0%|          | 0/24 [02:12<?, ?files/s]


## Configure Pipeline

In [None]:
# Run on the GPU when torch reports one is available; otherwise use the CPU.
inference_device = "cuda" if torch.cuda.is_available() else "cpu"

# Settings layered on top of the base audio pipeline configuration.
# NOTE(review): "take": 1 presumably limits the run to a single sample —
# confirm against AudioPipelineConfig.
audio_overrides = {
    "encoder_model": "sonar_speech_encoder_eng",
    "decoder_model": "text_sonar_basic_decoder",
    "target_lang": "eng_Latn",
    "batch_size": 1,
    "device": inference_device,
    "output_file_name": "transcription_results",
    "take": 1,
}

# Base configuration targets the "audio" column, then applies the overrides.
base_audio_config = AudioPipelineConfig(dataset_name="common_voice", columns=["audio"])
config = base_audio_config.with_overwrites(audio_overrides)

## Initialize and Run Pipeline

In [None]:
# Build the audio-to-text pipeline from the `config` object.
pipeline = AudioToTextHFPipeline(config)
# NOTE: this rebinds `dataset` to the pipeline's output, so the originally
# loaded dataset is no longer reachable under this name, and re-running this
# cell feeds the already-processed data back into the pipeline.
dataset = pipeline(dataset)

## Display Results

In [None]:
# Print the reference sentence next to the pipeline's transcription for the
# first few rows. The pipeline output is bound to `dataset` (there is no
# `processed_dataset` variable in this notebook), so both columns are read
# from it. zip() stops at the shorter column, so fewer than five rows is fine.
for original, transcribed in zip(dataset['sentence'][:5], dataset['audio_transcribed'][:5]):
    print(f"Original: {original}")
    print(f"Transcribed: {transcribed}")
    print()


## Visualize and Play Audio Sample

In [None]:
def plot_audio_and_transcription(audio, transcription, sample_rate):
    """Plot a waveform titled with its transcription, then show an audio player.

    Args:
        audio: Audio samples, passed to ``librosa.display.waveshow`` and
            ``IPython.display.Audio``.
        transcription: Text shown in the plot title.
        sample_rate: Sampling rate of ``audio``, used for both the plot and
            the audio player.
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    librosa.display.waveshow(audio, sr=sample_rate, ax=ax)
    ax.set_title(f'Audio Waveform\nTranscription: {transcription}')
    ax.set_xlabel('Time')
    ax.set_ylabel('Amplitude')
    plt.show()

    # Inline player so the clip can be listened to below the waveform.
    player = ipd.Audio(audio, rate=sample_rate)
    ipd.display(player)

# Index the row first and the column second: `dataset['audio'][0]` would
# materialise the entire audio column just to read one clip.
# The transcription is read from `dataset` as well, because that is the name
# the pipeline output is bound to (no `processed_dataset` variable exists).
first_sample = dataset[0]
sample_audio = first_sample['audio']['array']
sample_rate = first_sample['audio']['sampling_rate']
sample_transcription = first_sample['audio_transcribed']

plot_audio_and_transcription(sample_audio, sample_transcription, sample_rate)

## Configure and Run MetricAnalyzer Pipeline

In [None]:
# NOTE(review): MetricPipelineConfig is not imported in any visible cell of
# this notebook — add the import from its defining module before running.

# Settings layered on top of the base metric configuration.
metric_overrides = {
    "batch_size": 32,
    "device": "cpu",
    "metric_name": "wer",  # WER = Word Error Rate
    "low_score_threshold": 0.5,  # tune for your data
    "output_file_name": "transcription_metrics",
    "take": 100,
}

# The metric is computed over the "audio_transcribed" column.
base_metric_config = MetricPipelineConfig(
    dataset_name="common_voice",
    dataset_split="test",
    columns=["audio_transcribed"],
)
metric_config = base_metric_config.with_overwrites(metric_overrides)

In [None]:
# NOTE(review): MetricAnalyzerPipeline is not imported in any visible cell of
# this notebook — add the import from its defining module before running.
metric_pipeline = MetricAnalyzerPipeline(metric_config)

# The transcription output is bound to `dataset` (no `transcribed_dataset`
# variable exists). The scored result is stored under `evaluated_dataset`,
# the name the analysis cells read; `dataset` is left untouched so this cell
# can be re-run safely.
evaluated_dataset = metric_pipeline(dataset)

## Analyze Metric Scores

In [None]:
# Read the WER column once and reuse it for both the histogram and the mean.
wer_scores = evaluated_dataset['wer']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(wer_scores, bins=20)
ax.set_title('Distribution of Word Error Rates')
ax.set_xlabel('Word Error Rate')
ax.set_ylabel('Frequency')
# Dashed red line marks the configured threshold.
ax.axvline(
    x=metric_config.low_score_threshold,
    color='r',
    linestyle='--',
    label='Low Score Threshold',
)
ax.legend()
plt.show()

average_wer = sum(wer_scores) / len(wer_scores)
print(f"Average WER: {average_wer:.4f}")


## Analyze Low-Scoring Samples

In [None]:
# Keep only the rows whose WER exceeds the configured threshold.
low_scoring = evaluated_dataset.filter(lambda x: x['wer'] > metric_config.low_score_threshold)
print(f"Number of low-scoring samples: {len(low_scoring)}")

# Slicing a Dataset (`low_scoring[:5]`) returns a dict of columns, so looping
# over it yields column names rather than rows. Select the first rows
# explicitly instead; min() guards against fewer than five matches.
preview_count = min(5, len(low_scoring))
for sample in low_scoring.select(range(preview_count)):
    print(f"Original: {sample['sentence']}")
    print(f"Transcribed: {sample['audio_transcribed']}")
    print(f"WER: {sample['wer']:.4f}")
    print()